# Data ETL (Extract, Transform, Load)

In [1]:
import math
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [2]:
# Per-well configuration, keyed by well name.
# Each value is a list:
#   [(lower, upper) measured-depth cutoffs, source data filename, field, part of field (NaN when not given)]
dict_wells = {
    'MC348-3ST1': [
        (25654, 25834),
        'MC348-3ST1 Appomattox NE_LQC.csv',
        'Appomattox',
        'NE',
    ],
    'MC391-1': [
        (25402, 26080),
        'MC391-1 Appomattox SW_LQC.csv',
        'Appomattox',
        'SW',
    ],
    'MC392-1': [
        (24463, 24990),
        'MC392-1 Appomattox SE_LQC.csv',
        'Appomattox',
        'SE',
    ],
    'MC392-1ST1': [
        (25987, 26809),
        'MC392-1ST1 Appomattox SE_LQC.csv',
        'Appomattox',
        'SE',
    ],
    'MC393-1': [
        (25378, 26385),
        'MC393-1 Vicksburg A_LQC.csv',
        'Vicksburg A',
        np.nan,
    ],
    'MC525-2': [
        (25113, 26101),
        'MC525-2 Rydberg Deep_LQC.csv',
        'Rydberg Deep',
        np.nan,
    ],
    'MC612-1BP1': [
        (27611, 28695),
        'MC612-1BP1 Dover_LQC.csv',
        'Dover',
        np.nan,
    ],
}

In [3]:
# Collapse the header into one line: every log curve is renamed to '<curve>, <unit>'
rename_map = {
    'MD': 'MD, ft',
    'wellName': 'wellName_old',       # not a curve: renamed only, no unit suffix
    **{curve: '{}, {}'.format(curve, unit) for curve, unit in [
        ('DENSITY', 'g/cm3'),
        ('DTC', 'us/ft'),             # delta time compressional
        ('DTS', 'US/F'),              # delta time shear
        ('FLD', 'unitless'),          # fluid
        ('GR', 'gAPI'),
        ('NEUTRON', 'CFCF'),
        ('NMR_BW', 'v/v'),            # bound water
        ('NMR_CBW', 'v/v'),           # clay bound water
        ('NMR_FF', 'v/v'),            # free fluid
        ('NMR_TPOR', 'v/v'),          # total porosity
        ('RDEEP', 'ohm.m'),           # less affected by drilling process (we will use this one)
        ('RMEDIUM', 'ohm.m'),
        ('RSHALLOW', 'ohm.m'),        # affected by drilling process the most
        ('TVD', 'ft'),
        ('TVDBML', 'ft'),
        ('TVDSS', 'ft'),              # When comparing between wells, TVD is more useful (TVDSS?)
    ]},
}

In [4]:
def rearrange_columns(df):
    """Return a copy of ``df`` with the column layout shared by every well.

    Layout: identification columns first, then all remaining columns in their
    existing order, then the least interesting columns last. Any of the leading
    or trailing columns that ``df`` lacks (e.g. 'RMEDIUM, ohm.m' and
    'RSHALLOW, ohm.m') is created and filled with NaN so that all frames end up
    with the same sequence of columns.

    'wellName_old' is moved to the end because the shorter 'Well name' exists;
    'datasetName' has only one unique value and is kept as historical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rearrange. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the reordered (and, where needed, added) columns.
    """
    start_cols = ['Well name', 'Field', 'Field part']
    end_cols = ['Filename', 'wellName_old', 'datasetName', 'RMEDIUM, ohm.m', 'RSHALLOW, ohm.m']

    # Work on a copy: adding columns to the caller's frame is a hidden side effect
    # and raises SettingWithCopyWarning when ``df`` is a filtered slice.
    out = df.copy()
    for col in start_cols + end_cols:
        if col not in out:
            out[col] = np.nan

    fixed = set(start_cols + end_cols)
    middle_cols = [col for col in out if col not in fixed]
    return out[start_cols + middle_cols + end_cols]

In [5]:
# Read the CSV file of every well, clean it and keep only the measured-depth interval of interest
wdir = 'csv/'
data = []
for key, (md_cutoffs, filename, field, field_part) in dict_wells.items():
    md_min, md_max = md_cutoffs

    # Read everything as text so the '-9999' marker can be matched as a string
    df = pd.read_csv(wdir + filename, dtype='str')
    df = df.rename(columns=rename_map)
    df = df.drop(0)                      # drop the row labelled 0 (presumably a units row; TODO confirm)
    df = df.replace('-9999', np.nan)     # '-9999' is treated as missing

    # Convert each column to numbers where possible. Columns that cannot be parsed
    # (text such as the well / dataset name) are deliberately left unchanged; this
    # replaces the deprecated pd.to_numeric(..., errors='ignore').
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            pass

    # .copy() so the column assignments below do not write to a view of the filtered frame
    in_interval = (df['MD, ft'] >= md_min) & (df['MD, ft'] <= md_max)
    df = df[in_interval].copy()

    df['Well name'] = key
    df['Field'] = field
    df['Field part'] = field_part
    df['Filename'] = filename
    df = rearrange_columns(df)

    print('WELL {}'.format(key))
    print(df.dtypes)
    print('*'*100)

    data.append(df)

WELL MC348-3ST1
Well name           object
Field               object
Field part          object
MD, ft             float64
DENSITY, g/cm3     float64
DTC, us/ft         float64
DTS, US/F          float64
FLD, unitless      float64
GR, gAPI           float64
NEUTRON, CFCF      float64
NMR_BW, v/v        float64
NMR_CBW, v/v       float64
NMR_FF, v/v        float64
NMR_TPOR, v/v      float64
RDEEP, ohm.m       float64
TVD, ft            float64
TVDBML, ft         float64
TVDSS, ft          float64
Filename            object
wellName_old        object
datasetName         object
RMEDIUM, ohm.m     float64
RSHALLOW, ohm.m    float64
dtype: object
****************************************************************************************************
WELL MC391-1
Well name           object
Field               object
Field part          object
MD, ft             float64
DENSITY, g/cm3     float64
DTC, us/ft         float64
DTS, US/F          float64
FLD, unitless        int64
GR, gAPI           

In [6]:
# Neutron porosity: a maximum above 1 is taken to mean the curve is in percent,
# in which case it is converted to a fraction; the resulting range is printed per well
neutron_col = 'NEUTRON, CFCF'
for well_df in data:
    neutron = well_df[neutron_col]
    if neutron.max() > 1:
        well_df[neutron_col] = neutron / 100
    print(well_df[neutron_col].min(), well_df[neutron_col].max())

0.134075 0.251625
0.11034700000000001 0.34739000000000003
0.11412 0.32618
0.09367353 0.3542884
0.16653300000000001 0.385244
0.1879767 0.3121433
0.15607079999999998 0.3502145


In [7]:
print('Shape of files for each well:')
well_shapes = [well_df.shape for well_df in data]
for shape in well_shapes:
    print('\t', shape)

# Stack all wells into one frame with a fresh 0..n-1 index
data = pd.concat(data, ignore_index=True)
print('Final shape:', data.shape)

Shape of files for each well:
	 (721, 23)
	 (2713, 23)
	 (1055, 23)
	 (3286, 23)
	 (4029, 23)
	 (1977, 23)
	 (2175, 23)
Final shape: (15956, 23)


In [8]:
# Matt Knuth: negative NMR_FF values don't have any meaning. Typically they would be
# very low values that have been made negative by temperature correction -> set them to 0
nmr_ff = data['NMR_FF, v/v']
data['NMR_FF, v/v'] = nmr_ff.mask(nmr_ff < 0, 0)

In [9]:
# Matt Knuth: take the log10 of resistivity
def to_log(value):
    """Return the base-10 logarithm of ``value``.

    Parameters
    ----------
    value : float
        A single reading (used here for resistivity).

    Returns
    -------
    float
        ``log10(value)`` for positive input. Missing input (anything
        ``pd.isnull`` reports as null) is returned unchanged. Zero or negative
        input has no logarithm and yields NaN instead of raising ``ValueError``,
        so one bad sample cannot abort a whole ``.apply``.
    """
    if pd.isnull(value):
        return value
    if value <= 0:
        # log10 is undefined here; treat the sample as missing
        return math.nan
    return math.log10(value)
    
# Replace each resistivity curve by its log10, value by value
for resistivity_col in ('RDEEP, ohm.m', 'RMEDIUM, ohm.m', 'RSHALLOW, ohm.m'):
    data[resistivity_col] = data[resistivity_col].apply(to_log)

In [10]:
# DESCRIBE COLUMNS
def describe_columns(df):
    """Print a summary of every column of ``df``.

    For each column: dtype of its non-missing values, number of rows, number of
    distinct values (as returned by ``unique()``), number of missing values and
    the value counts. Columns of dtype float64 additionally get min, max and
    mean. Returns nothing.
    """
    summary = '\n{}\n\tdtype: {}\n\tAll: {}\n\tUnique: {}\n\tMissing: {}'
    separator = '*'*100

    for col in df.columns:
        series = df[col]
        is_missing = series.isna()

        print(summary.format(col,
                             series[~is_missing].dtype,
                             len(series),
                             len(series.unique()),
                             is_missing.sum()))

        if series.dtype == 'float64':
            print('\tMin:', series.min())
            print('\tMax:', series.max())
            print('\tMean: {}\n'.format(series.mean()))

        print(series.value_counts())
        print(separator)

In [11]:
# Preview the first 50 rows of the combined frame (before scaling)
data.head(50)

Unnamed: 0,Well name,Field,Field part,"MD, ft","DENSITY, g/cm3","DTC, us/ft","DTS, US/F","FLD, unitless","GR, gAPI","NEUTRON, CFCF","NMR_BW, v/v","NMR_CBW, v/v","NMR_FF, v/v","NMR_TPOR, v/v","RDEEP, ohm.m","TVD, ft","TVDBML, ft","TVDSS, ft",Filename,wellName_old,datasetName,"RMEDIUM, ohm.m","RSHALLOW, ohm.m"
0,MC348-3ST1,Appomattox,NE,25654.0,3.1089,66.67333,100.4266,2.0,26.3041,0.251625,0.00265,0.01018,0.00151,0.01434,0.293561,25215.8,17884.8,25140.8,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.424346,
1,MC348-3ST1,Appomattox,NE,25654.25,3.1126,66.94559,100.8265,2.0,23.341801,0.22285,0.00308,0.0221,0.00217,0.02735,0.365731,25216.02,17885.02,25141.02,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.416532,
2,MC348-3ST1,Appomattox,NE,25654.5,3.0757,67.21785,101.2297,2.0,21.16909,0.20355,0.00337,0.01907,0.002365,0.02481,0.512044,25216.24,17885.24,25141.24,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.253025,
3,MC348-3ST1,Appomattox,NE,25654.75,3.0079,67.48043,102.218,2.0,20.1453,0.187125,0.00467,0.01142,0.003945,0.02004,0.681784,25216.46,17885.46,25141.46,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.145016,
4,MC348-3ST1,Appomattox,NE,25655.0,2.9167,67.74301,103.2258,2.0,18.4986,0.173575,0.01121,0.0138,0.01279,0.0378,0.827608,25216.68,17885.68,25141.68,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.094094,
5,MC348-3ST1,Appomattox,NE,25655.25,2.7429,67.84351,105.8405,2.0,17.04895,0.1673,0.01544,0.02446,0.02743,0.067325,0.900127,25216.9,17885.9,25141.9,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.077713,
6,MC348-3ST1,Appomattox,NE,25655.5,2.6334,67.94402,108.5911,2.0,16.085649,0.1683,0.016635,0.03165,0.040665,0.088945,0.904802,25217.13,17886.13,25142.13,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.076939,
7,MC348-3ST1,Appomattox,NE,25655.75,2.5835,69.3002,109.1243,2.0,16.102949,0.16885,0.018535,0.022375,0.05181,0.09272,0.876887,25217.34,17886.34,25142.34,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.044438,
8,MC348-3ST1,Appomattox,NE,25656.0,2.5735,70.65639,109.6628,2.0,16.7272,0.16895,0.02494,0.00662,0.05311,0.08467,0.885859,25217.56,17886.56,25142.56,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.016076,
9,MC348-3ST1,Appomattox,NE,25656.25,2.5424,72.24492,110.5195,2.0,17.8512,0.166975,0.027725,0.002545,0.074975,0.105245,0.927304,25217.79,17886.79,25142.79,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,1.000287,


In [12]:
# Per-column summary of the combined frame (before scaling)
describe_columns(data)


Well name
	dtype: object
	All: 15956
	Unique: 7
	Missing: 0
MC393-1       4029
MC392-1ST1    3286
MC391-1       2713
MC612-1BP1    2175
MC525-2       1977
MC392-1       1055
MC348-3ST1     721
Name: Well name, dtype: int64
****************************************************************************************************

Field
	dtype: object
	All: 15956
	Unique: 4
	Missing: 0
Appomattox      7775
Vicksburg A     4029
Dover           2175
Rydberg Deep    1977
Name: Field, dtype: int64
****************************************************************************************************

Field part
	dtype: object
	All: 15956
	Unique: 4
	Missing: 8181
SE    4341
SW    2713
NE     721
Name: Field part, dtype: int64
****************************************************************************************************

MD, ft
	dtype: float64
	All: 15956
	Unique: 11067
	Missing: 0
	Min: 24463.0
	Max: 28694.6030412954
	Mean: 26156.911958809298

25691.000000    4
25785.000000    4
25667.000000  

In [13]:
# Keep an unscaled copy, then standardize the log columns in place.
# Matt Knuth: Standardizing with mean & std is better than minmax scaling because it can preserve more data
data_original = data.copy(deep=True)

# Columns left untouched by the scaler: identifiers, depths and the fluid flag
to_stay = [
    'Well name', 'Field', 'Field part',
    'Filename', 'wellName_old', 'datasetName',
    'TVD, ft', 'TVDBML, ft', 'TVDSS, ft', 'MD, ft',
    'FLD, unitless',
]
to_scale = [col for col in data.columns if col not in to_stay]

scaler = StandardScaler().fit(data[to_scale])
data[to_scale] = scaler.transform(data[to_scale])

In [14]:
# Preview the first 50 rows again, now with the scaled columns
data.head(50)

Unnamed: 0,Well name,Field,Field part,"MD, ft","DENSITY, g/cm3","DTC, us/ft","DTS, US/F","FLD, unitless","GR, gAPI","NEUTRON, CFCF","NMR_BW, v/v","NMR_CBW, v/v","NMR_FF, v/v","NMR_TPOR, v/v","RDEEP, ohm.m","TVD, ft","TVDBML, ft","TVDSS, ft",Filename,wellName_old,datasetName,"RMEDIUM, ohm.m","RSHALLOW, ohm.m"
0,MC348-3ST1,Appomattox,NE,25654.0,11.810724,-3.282618,-3.29202,2.0,-1.185375,-0.40633,-2.059905,0.044636,-3.042459,-4.929566,0.041163,25215.8,17884.8,25140.8,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,3.532304,
1,MC348-3ST1,Appomattox,NE,25654.25,11.862163,-3.241252,-3.26611,2.0,-1.671831,-1.135766,-2.043275,1.17222,-3.029381,-4.618738,0.175006,25216.02,17885.02,25141.02,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,3.514227,
2,MC348-3ST1,Appomattox,NE,25654.5,11.349167,-3.199886,-3.239986,2.0,-2.028623,-1.625015,-2.032059,0.885594,-3.025518,-4.679422,0.446352,25216.24,17885.24,25141.24,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,3.135936,
3,MC348-3ST1,Appomattox,NE,25654.75,10.40659,-3.15999,-3.175953,2.0,-2.196745,-2.041383,-1.981783,0.161935,-2.994211,-4.793384,0.761145,25216.46,17885.46,25141.46,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.886046,
4,MC348-3ST1,Appomattox,NE,25655.0,9.138699,-3.120095,-3.110656,2.0,-2.467159,-2.384871,-1.728851,0.387073,-2.818953,-4.369072,1.031585,25216.68,17885.68,25141.68,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.768231,
5,MC348-3ST1,Appomattox,NE,25655.25,6.722475,-3.104825,-2.941246,2.0,-2.705214,-2.54394,-1.565258,1.395467,-2.52887,-3.663677,1.166075,25216.9,17885.9,25141.9,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.730333,
6,MC348-3ST1,Appomattox,NE,25655.5,5.200171,-3.089554,-2.763031,2.0,-2.863403,-2.51859,-1.519042,2.075612,-2.266627,-3.147144,1.174745,25217.13,17886.13,25142.13,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.728542,
7,MC348-3ST1,Appomattox,NE,25655.75,4.506445,-2.883502,-2.728484,2.0,-2.860562,-2.504648,-1.445561,1.198234,-2.045796,-3.056953,1.122976,25217.34,17886.34,25142.34,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.653347,
8,MC348-3ST1,Appomattox,NE,25656.0,4.367422,-2.677447,-2.693594,2.0,-2.75805,-2.502113,-1.19785,-0.292126,-2.020037,-3.24928,1.139613,25217.56,17886.56,25142.56,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.587729,
9,MC348-3ST1,Appomattox,NE,25656.25,3.93506,-2.436092,-2.638087,2.0,-2.573472,-2.552179,-1.090142,-0.677605,-1.586796,-2.757713,1.216476,25217.79,17886.79,25142.79,MC348-3ST1 Appomattox NE_LQC.csv,MC348-3ST1 Appomattox NE,LQC,2.551199,


In [15]:
# Per-column summary of the frame after scaling
describe_columns(data)


Well name
	dtype: object
	All: 15956
	Unique: 7
	Missing: 0
MC393-1       4029
MC392-1ST1    3286
MC391-1       2713
MC612-1BP1    2175
MC525-2       1977
MC392-1       1055
MC348-3ST1     721
Name: Well name, dtype: int64
****************************************************************************************************

Field
	dtype: object
	All: 15956
	Unique: 4
	Missing: 0
Appomattox      7775
Vicksburg A     4029
Dover           2175
Rydberg Deep    1977
Name: Field, dtype: int64
****************************************************************************************************

Field part
	dtype: object
	All: 15956
	Unique: 4
	Missing: 8181
SE    4341
SW    2713
NE     721
Name: Field part, dtype: int64
****************************************************************************************************

MD, ft
	dtype: float64
	All: 15956
	Unique: 11067
	Missing: 0
	Min: 24463.0
	Max: 28694.6030412954
	Mean: 26156.911958809298

25691.000000    4
25785.000000    4
25667.000000  

	Min: -1.9754849690293936
	Max: 2.9144491936626085
	Mean: 0.0

-1.396285    3
 0.311445    2
-1.668754    2
-1.615988    2
-1.485096    2
            ..
-0.630153    1
-1.006260    1
-1.525264    1
 0.143436    1
-1.573438    1
Name: RSHALLOW, ohm.m, Length: 8472, dtype: int64
****************************************************************************************************


In [18]:
# Write both versions of the dataset: unscaled and standardized
for frame, csv_name in ((data_original, 'norphlet_original.csv'),
                        (data, 'norphlet_scaled.csv')):
    frame.to_csv(csv_name, index=False, encoding='utf8')

__Recommendations by Matt Knuth (05/04/2020 & 05/24/2020)__  
* We should be able to classify what we see (Matt's words)
* Standardizing with mean & std is better than minmax scaling because it can preserve more data, but take the log10 of resistivity. Do it for all the wells
* Dover well's top is good, bottom bad (NMR_FF - poor quality), there is also a water leg - look for FLD (fluid) 1
* Split the scope of work by task, not by wells
* Segmentation within one well - look at the variability of the data (five 50-foot segments in one well are representative enough to describe this well). If I am trying to classify this, what is a representative interval (e.g. the top 100 feet in this well)? We don't need the NMR for this, only GR, D, R; reason: NMR_TPOR is correlated with D. If NMR is used, then it should be NMR_FF because it removes microporosity. Assigning weights can also help, e.g. NMR_FF + NMR_BW + NMR_CBW = NMR_TPOR
* Depositional facies vary by 5 ft - resolution too small, but if we find that these 100 feet are better quality, we can segment it this way
* MC392-1ST1 and MC393-1 are deviated wells; in the FLD column, 2 = oil, 1 = water

* All logs are correlated with porosity - use the ratio DTC / DTS because it will be less sensitive to it
* NMR_FF may be dependent on e.g. DTC / DTS and resistivity
* Matt to provide stratigraphic subzones, climatic zones, and a formula to estimate permeability, but don't use the formula because it's a derived value and it will be correlated with the existing logs anyway