Copyright 2019 Lewis Matthews License: Creative Commons Attribution 4.0 International https://creativecommons.org/licenses/by/4.0/

This workflow was written to be accompanied by a bootcamp. 

www.wtdsi.org

www.crownquest.com

In [None]:
# Import toolboxes needed for workflow
import pandas as pd

from pandas import set_option
set_option('display.max_columns',100)
set_option('display.max_rows',1000)

import numpy as np

import operator

import os
import re

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR
from sklearn.metrics.scorer import SCORERS
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split

from bokeh.io import push_notebook, show, output_notebook
from bokeh.models import ColumnDataSource, ColorBar
from bokeh.palettes import Spectral6
from bokeh.transform import linear_cmap
from bokeh.plotting import figure
output_notebook()

In [None]:
# Load the merged spreadsheet for each of the five vertical wells (A-E).
# Files are read in alphabetical order from the working directory.

VertA, VertB, VertC, VertD, VertE = (
    pd.read_excel('Vert' + well + '_merge.xlsx') for well in 'ABCDE'
)

In [None]:
# Take a look at VertA

VertA.describe()

In [None]:
# Take a look at VertB

VertB.describe()

In [None]:
# Take a look at VertC

VertC.describe()

In [None]:
# Take a look at VertD

VertD.describe()

In [None]:
# Take a look at VertE

VertE.describe()

In [None]:
# Bring the neutron-porosity curves onto one scale.
# The curve is called NPOR:1 in VertA and NPRL:1 in VertB/VertD; those three are
# divided by 100 (presumably percent -> fraction) to match the other wells.
# NOTE: re-running this cell divides again, so run it once per data load.
for _well, _curve in ((VertA, 'NPOR:1'), (VertB, 'NPRL:1'), (VertD, 'NPRL:1')):
    _well[_curve] = _well[_curve] / 100

In [None]:
# Reduce every well to the same eight curves under one naming scheme:
# DEPT, DT, GR, ROP, WOB, RPM, RHOB, NPHI.
# VertA's RPM column is all zeros, so TOP_DRIVE_RPM is used as its RPM substitute.

# (original name, standard name) pairs, listed in the output column order.
_RENAMES_A = [("DT35", "DT"), ("GMSG", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
              ("TOP_DRIVE_RPM", "RPM"), ("DEN", "RHOB"), ("NPOR:1", "NPHI")]
_RENAMES_B = [("DT35", "DT"), ("GMGC", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
              ("ROTARY_RPM", "RPM"), ("DEN", "RHOB"), ("NPRL:1", "NPHI")]
_RENAMES_C = [("DTCO", "DT"), ("GR", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
              ("ROTARY_RPM", "RPM"), ("RHOB", "RHOB"), ("NPOR", "NPHI")]
_RENAMES_D = [("MCDT", "DT"), ("GMGC", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
              ("ROTARY_RPM", "RPM"), ("DEN", "RHOB"), ("NPRL:1", "NPHI")]
_RENAMES_E = [("DT", "DT"), ("GRTO", "GR"), ("ROP", "ROP"), ("WOB", "WOB"),
              ("RPM", "RPM"), ("RHOB", "RHOB"), ("NPHI", "NPHI")]


def _standardize(well, renames):
    """Keep DEPT plus the listed curves, rename them, and stringify the row index."""
    keep = ['DEPT'] + [old for old, _ in renames]
    return well[keep].rename(index=str, columns=dict(renames))


VertA = _standardize(VertA, _RENAMES_A)
VertB = _standardize(VertB, _RENAMES_B)
VertC = _standardize(VertC, _RENAMES_C)
VertD = _standardize(VertD, _RENAMES_D)
VertE = _standardize(VertE, _RENAMES_E)

In [None]:
# Plot VertB's sonic curve (DT, originally the DT35 column) against depth.

x = VertB['DEPT']
y = VertB['DT']

p = figure(title='VertB DT', plot_width=800, plot_height=300)
# Use `legend_label`, the keyword the final VertE plot in this notebook already
# uses; the bare `legend` keyword is deprecated in Bokeh.
r1 = p.line(x, y, color='green', line_width=0.25, legend_label='DT')
p.xaxis.axis_label = 'Depth (ft)'
# NOTE(review): sonic DT is conventionally reported in us/ft - confirm the 'ms/ft' unit.
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'top_left'
show(p)

<div class="alert alert-success">
    <b>EXERCISE 1</b>:
     <ul>
      <li>Insert new cells and plot gamma ray, porosity, and density for VertB.</li>
      <li>For Gamma Ray: GR</li>
      <li>For Porosity: NPHI</li>
      <li>For Density: RHOB</li>
    </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 1 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 1 SOLUTION END </b>
</div>

In [None]:
# Plot VertC's sonic curve (DT, originally the DTCO column) against depth.

x = VertC['DEPT']
y = VertC['DT']

p = figure(title='VertC DTCO', plot_width=600, plot_height=300)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r1 = p.line(x, y, color='green', line_width=0.25, legend_label='DT')
p.xaxis.axis_label = 'Depth (ft)'
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'top_left'
show(p)

<div class="alert alert-success">
    <b>EXERCISE 2</b>:
     <ul>
      <li>Insert new cells and plot gamma ray, porosity, and density for VertC.</li>
      <li>For Gamma Ray: GR</li>
      <li>For Porosity: NPHI</li>
      <li>For Density: RHOB</li>
    </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 2 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 2 SOLUTION END </b>
</div>

Training and Testing Data

To evaluate our models, we split our dataset into a training set and a test set. Recall the supervised learning example with apples and oranges: we checked how well the machine had learned to tell them apart by testing it on apples and oranges it had not previously seen.

For our example we are going to train a machine to predict a compressional wave sonic log using some drilling data and a gamma ray log. We will split VertB into a training and testing set and evaluate the performance of the model within the VertB dataset. We will then use this model to predict the compressional wave sonic log in VertC and evaluate our performance. The goal is a fair evaluation of the system.

![Screen%20Shot%202018-10-17%20at%202.40.04%20PM.png](attachment:Screen%20Shot%202018-10-17%20at%202.40.04%20PM.png)

In [None]:
# Build the inputs (three drilling parameters plus gamma ray) and the sonic
# target (DT) for VertB and VertC.
_knr_features = ['ROP', 'WOB', 'RPM', 'GR']

X_VertB, y_VertB = VertB[_knr_features], VertB['DT']
X_VertC, y_VertC = VertC[_knr_features], VertC['DT']

In [None]:
# Plan: train on VertB and predict VertC, then train on VertC and predict VertB.
# Each well is first divided into a 90% training / 10% test partition with a
# fixed seed so the split is reproducible.
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

from sklearn.model_selection import train_test_split

_split_options = dict(train_size=0.9, test_size=0.1, random_state=42)

(train_X_VertB, test_X_VertB,
 train_y_VertB, test_y_VertB) = train_test_split(X_VertB, y_VertB, **_split_options)

(train_X_VertC, test_X_VertC,
 train_y_VertC, test_y_VertC) = train_test_split(X_VertC, y_VertC, **_split_options)

In [None]:
# Nearest-neighbour regression: fit on VertB's training split, then predict DT
# for VertB's held-out test split.

from sklearn.neighbors import KNeighborsRegressor

_knr_settings = {'n_neighbors': 5, 'n_jobs': -1}  # 5 neighbours, all CPU cores
kneighbor_regression = KNeighborsRegressor(**_knr_settings)

kneighbor_regression.fit(train_X_VertB, train_y_VertB)
VertB_test = kneighbor_regression.predict(test_X_VertB)

In [None]:
# Crossplot of actual vs. KNN-predicted DT for VertB's held-out test split.

x = test_y_VertB
y = VertB_test

p = figure(title='VertB DT Crossplot', plot_width=300, plot_height=300)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r1 = p.circle(x, y, color='black', line_width=0.25, legend_label='DT')
p.xaxis.axis_label = 'VertB Actual'
p.yaxis.axis_label = 'VertB Prediction'
p.legend.location = 'bottom_right'
show(p)

In [None]:
# Apply the KNN model fitted on VertB's training split to every row of VertC,
# giving a predicted DT curve for the whole well.
VertC_Pred_DT_KNR = kneighbor_regression.predict(X_VertC)

In [None]:
# Overlay measured DT and the KNN-predicted DT for VertC against depth.

x = VertC['DEPT']
y = VertC['DT']
z = VertC_Pred_DT_KNR

p = figure(title='VertC Predicted DTCO using KNR', plot_width=800, plot_height=300)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r = p.line(x, y, color='green', line_width=0.5, legend_label='DT')
r1 = p.line(x, z, color='blue', line_width=0.5, legend_label='Pred. DT')
p.xaxis.axis_label = 'Depth (ft)'
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'top_left'
show(p)

In [None]:
# Now train on VertC to predict VertB.
# This refits the same estimator object, so from here on `kneighbor_regression`
# holds the VertC-trained model rather than the VertB-trained one.
kneighbor_regression.fit(train_X_VertC, train_y_VertC)

# Predict DT for VertC's held-out test split.
VertC_test = kneighbor_regression.predict(test_X_VertC)

In [None]:
# Crossplot of actual vs. KNN-predicted DT for VertC's held-out test split.

x = test_y_VertC
y = VertC_test

p = figure(title='VertC DT Crossplot', plot_width=300, plot_height=300)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r1 = p.circle(x, y, color='black', line_width=0.25, legend_label='DT')
p.xaxis.axis_label = 'VertC Actual'
p.yaxis.axis_label = 'VertC Prediction'
p.legend.location = 'bottom_right'
show(p)

In [None]:
# Apply the KNN model (now fitted on VertC's training split) to every row of VertB.
VertB_Pred_DT_KNR = kneighbor_regression.predict(X_VertB)

In [None]:
# Overlay measured DT and the KNN-predicted DT for VertB against depth.
x = VertB['DEPT']
y = VertB['DT']
z = VertB_Pred_DT_KNR

p = figure(title='VertB Predicted DT using KNR', plot_width=600, plot_height=300)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r = p.line(x, y, color='green', line_width=0.25, legend_label='DT')
r1 = p.line(x, z, color='blue', line_width=0.25, legend_label='Pred. DT')
p.xaxis.axis_label = 'Depth (ft)'
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'top_left'
show(p)

In [None]:
# Score both cross-well KNN predictions with mean absolute error.
# Each error is named after the well the model was TRAINED on:
#   error_VertB_model -> trained on VertB, scored on all of VertC
#   error_VertC_model -> trained on VertC, scored on all of VertB
from sklearn.metrics import mean_absolute_error

error_VertB_model = mean_absolute_error(y_true=y_VertC, y_pred=VertC_Pred_DT_KNR)
error_VertC_model = mean_absolute_error(y_true=y_VertB, y_pred=VertB_Pred_DT_KNR)

In [None]:
# Lots of metrics are available; print the names scikit-learn accepts for `scoring=`.
# The old `sklearn.metrics.scorer` module path was removed from scikit-learn,
# so use the public helpers instead, newest API first.
try:
    from sklearn.metrics import get_scorer_names
    print(get_scorer_names())
except ImportError:
    # Older scikit-learn: the scorer registry is exported from sklearn.metrics.
    from sklearn.metrics import SCORERS
    print(SCORERS.keys())

In [None]:
y_VertC.mean()

In [None]:
error_VertB_model

In [None]:
y_VertB.mean()

In [None]:
error_VertC_model

In [None]:
# Can we make this prediction better? Let's try a different regression method in sklearn.
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

from sklearn.svm import SVR
# Linear-kernel support vector regressor with regularization strength C=1232.
# NOTE(review): per the SVR docs `gamma` applies to the 'rbf', 'poly' and
# 'sigmoid' kernels, so gamma=10 should have no effect here - confirm.
svr = SVR(kernel='linear',C=1232, gamma=10)

In [None]:
svr

In [None]:
# SVR works much faster when data is normalized.
# Normalizer rescales each ROW (sample) to unit norm and learns nothing from
# the data in `fit`, so fitting it separately per well is harmless.

from sklearn.preprocessing import Normalizer

_svr_features = ['ROP','WOB','RPM','GR']

scaler = Normalizer().fit(VertB[_svr_features])
normalized_VertB = scaler.transform(VertB[_svr_features])

scaler = Normalizer().fit(VertC[_svr_features])
normalized_VertC = scaler.transform(VertC[_svr_features])

# Convert back to pandas dataframes.
# Pass a FLAT list of names: a nested list ([[...]]) makes pandas build a
# one-level MultiIndex whose labels are tuples such as ('ROP',) rather than
# plain strings.
normalized_VertB = pd.DataFrame(normalized_VertB, columns=_svr_features)
normalized_VertC = pd.DataFrame(normalized_VertC, columns=_svr_features)

In [None]:
# Redo the 90/10 train/test partition on the normalized features (same seed as
# before). This overwrites the train_*/test_* variables made from the raw features.

_split_options = dict(train_size=0.9, test_size=0.1, random_state=42)

(train_X_VertB, test_X_VertB,
 train_y_VertB, test_y_VertB) = train_test_split(normalized_VertB, y_VertB, **_split_options)

(train_X_VertC, test_X_VertC,
 train_y_VertC, test_y_VertC) = train_test_split(normalized_VertC, y_VertC, **_split_options)

In [None]:
# Fit the SVR on VertB's training split, then predict DT for all of VertC.
# `fit` returns the estimator itself, so VertBtoVertC_DT is the same object as
# `svr`; refitting `svr` later also changes what VertBtoVertC_DT predicts.
VertBtoVertC_DT = svr.fit(train_X_VertB, train_y_VertB)
VertC_Pred_DT = VertBtoVertC_DT.predict(normalized_VertC)

In [None]:
# Compare measured DT for VertC with the SVR and KNN predictions, against depth.
x = VertC['DEPT']
y = VertC['DT']
z = VertC_Pred_DT
z1 = VertC_Pred_DT_KNR

p = figure(title='VertC Predicted DT using SVR', plot_width=800, plot_height=500)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r = p.line(x, y, color='green', line_width=0.5, legend_label='DT')
r1 = p.line(x, z, color='blue', line_width=0.5, legend_label='Pred. DT SVR')
r2 = p.line(x, z1, color='red', line_width=0.5, legend_label='Pred. DT KNR')

p.xaxis.axis_label = 'Depth (ft)'
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'top_left'
show(p)

In [None]:
# Point of interest. With a linear kernel, SVR exposes one fitted coefficient
# per input feature, which shows how much each variable drives the prediction.
importances = svr.coef_.flatten()

# Show each coefficient next to its feature name. The order matches the columns
# of the normalized training frame. (Previously the cell ended in a bare tuple
# of names, so only the names were displayed and the values never were.)
pd.Series(importances, index=['ROP','WOB','RPM','GR'])

In [None]:
# MAE of the VertB-trained SVR on all of VertC.
# This overwrites the KNN value stored under the same name earlier.
error_VertB_model = mean_absolute_error(y_VertC, VertC_Pred_DT)
error_VertB_model

In [None]:
# Refit the same SVR object on VertC's training split, then predict DT for all of VertB.
# `fit` returns the estimator itself, so `svr`, VertBtoVertC_DT and
# VertCtoVertB_DT all refer to this VertC-trained model after this cell runs.
VertCtoVertB_DT = svr.fit(train_X_VertC, train_y_VertC)
VertB_Pred_DT = VertCtoVertB_DT.predict(normalized_VertB)

In [None]:
# Compare measured DT for VertB with the SVR and KNN predictions, against depth.
x = VertB['DEPT']
y = VertB['DT']
z = VertB_Pred_DT
z1 = VertB_Pred_DT_KNR

p = figure(title='VertB Predicted DT using SVR', plot_width=600, plot_height=300)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r = p.line(x, y, color='green', line_width=0.5, legend_label='DT')
r1 = p.line(x, z, color='blue', line_width=0.5, legend_label='Pred. DT SVR')
r2 = p.line(x, z1, color='red', line_width=0.5, legend_label='Pred. DT KNR')

p.xaxis.axis_label = 'Depth (ft)'
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'top_left'
show(p)

In [None]:
# MAE of the VertC-trained SVR on all of VertB.
# This overwrites the KNN value stored under the same name earlier.
error_VertC_model = mean_absolute_error(y_VertB, VertB_Pred_DT)
error_VertC_model

<div class="alert alert-success">
    <b>EXERCISE 3</b>:
     <ul>
      <li>See if you can predict the density logs in a similar manner.</li>
      <li>Prepare your data.</li>
      <li>Create your model using SVR or KNR.</li>
      <li>Fit the model on the 1st well.</li>
      <li>Use the model to predict the 2nd well.</li>
      <li>Plot the predicted vs. actual results.</li>
    </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 3 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 3 SOLUTION END </b>
</div>

We are now going to start to tune our model. There isn't really a general rule for finding the sweet spot. We will start by ranking our features based on what the model determines is important. We will then perform a grid step search looking for the right settings to get the 'best' model performance. 

For recursive feature elimination we need to add back all the overlapping data between VertB and VertC.

In [None]:
print(VertB.columns.tolist())

In [None]:
print(VertC.columns.tolist())

In [None]:
# Widen the inputs to every curve VertB and VertC share (besides DEPT and the
# DT target) so that feature elimination has all six candidates to rank.
_rfe_features = ['GR', 'ROP', 'WOB', 'RPM', 'RHOB', 'NPHI']

X_VertB, y_VertB = VertB[_rfe_features], VertB['DT']
X_VertC, y_VertC = VertC[_rfe_features], VertC['DT']

In [None]:
# SVR works much faster when data is normalized.
# Normalizer rescales each ROW (sample) to unit norm and learns nothing from
# the data in `fit`, so fitting it separately per well is harmless.

_rfe_columns = ['GR', 'ROP', 'WOB', 'RPM', 'RHOB', 'NPHI']

scaler = Normalizer().fit(VertB[_rfe_columns])
normalized_VertB = scaler.transform(VertB[_rfe_columns])

scaler = Normalizer().fit(VertC[_rfe_columns])
normalized_VertC = scaler.transform(VertC[_rfe_columns])

# Convert back to pandas dataframes.
# Pass a FLAT list of names: a nested list ([[...]]) makes pandas build a
# one-level MultiIndex whose labels are tuples such as ('GR',) rather than
# plain strings, which is also what `columns.tolist()` would then display.
normalized_VertB = pd.DataFrame(normalized_VertB, columns=_rfe_columns)
normalized_VertC = pd.DataFrame(normalized_VertC, columns=_rfe_columns)

The recursive feature elimination (RFE) module selects subsets of features. The estimator is trained on all the features in question and an importance ranking is obtained. The least important feature is dropped recursively until the specified number of features is obtained.

In [None]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
# We are arbitrarily asking the machine to keep the four best features for this model.

from sklearn.feature_selection import RFE

# Recursive feature elimination driven by the linear-kernel SVR defined earlier.
select = RFE(svr,n_features_to_select=4)

In [None]:
# Rank VertB's six normalized features against DT, keep the selected ones, and
# print the shape before and after selection.
normalized_VertB_FS = select.fit(normalized_VertB, y_VertB).transform(normalized_VertB)
for _frame in (normalized_VertB, normalized_VertB_FS):
    print(_frame.shape)

In [None]:
#RFE.get_support?

In [None]:
mask = select.get_support()
mask

In [None]:
# Draw the boolean support mask as a single-row image, one cell per input
# feature in column order. Black is True (kept), White is False (dropped).
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

In [None]:
normalized_VertB.columns.tolist()

In [None]:
# Label the RFE-selected columns using the selector's own support mask rather
# than a hand-typed list, so the names always match what RFE actually kept.
normalized_VertB_FS = pd.DataFrame(normalized_VertB_FS,
                                   columns=normalized_VertB.columns[select.get_support()])

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

# Five shuffled folds with a fixed seed, so every candidate sees the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 8 values of C x 6 values of gamma = 48 candidate settings.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 1232],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10],
}

# Exhaustive search over the grid using a default-constructed SVR, all CPU cores.
grid = GridSearchCV(
    SVR(),
    param_grid=param_grid,
    cv=cv,
    verbose=1,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
#GridSearchCV?

In [None]:
grid.fit(normalized_VertB_FS, y_VertB)

In [None]:
# We can view more detailed results using the cv_results_ variable
#cv_results = pd.DataFrame(grid.cv_results_)
#cv_results.head(48)

In [None]:
print(grid.best_score_)

In [None]:
print(grid.best_params_)

In [None]:
# Take the hyperparameter tuned model with RFE to predict DT for VertC.
# Pick VertC's columns with the selector's support mask so they are exactly the
# features the tuned model was trained on, even if RFE's choice changes.
hypertune_SVR = grid.predict(normalized_VertC.loc[:, select.get_support()])

In [None]:
# Compare measured DT for VertC with the untuned SVR, KNN and tuned-SVR predictions.
x = VertC['DEPT']
y = VertC['DT']
z = VertC_Pred_DT
z1 = VertC_Pred_DT_KNR
z2 = hypertune_SVR

p = figure(title='VertC Predicted DT using SVR', plot_width=600, plot_height=400)
# `legend_label` matches the final VertE plot in this notebook; the bare
# `legend` keyword is deprecated in Bokeh.
r = p.line(x, y, color='green', line_width=0.5, legend_label='DT')
r1 = p.line(x, z, color='blue', line_width=0.5, legend_label='Pred. DT SVR')
r2 = p.line(x, z1, color='red', line_width=0.5, legend_label='Pred. DT KNR')
r3 = p.line(x, z2, color='orange', line_width=0.5, legend_label='hypertune_SVR')

p.xaxis.axis_label = 'Depth (ft)'
p.yaxis.axis_label = 'Delta Time (ms/ft)'
p.legend.location = 'bottom_left'
show(p)

In [None]:
# Calculate MAE so we can compare model performance before and after tuning.
# First printed value: tuned SVR (RFE + grid search) on VertC.
# Second printed value: error_VertB_model, which at this point holds the
# untuned VertB-trained SVR's MAE on VertC.
error_VertB_hypertune_model = mean_absolute_error(y_VertC, hypertune_SVR)
print(error_VertB_hypertune_model)
print(error_VertB_model)

<div class="alert alert-success">
    <b>EXERCISE 4</b>:
     <ul>
      <li>See if you can predict the density and porosity logs in a similar manner (RHOB, NPHI). 
      <li>Prepare your data. 
      <li>Create your model using SVR, KNR, etc. 
      <li>Fit the model on 1st well. 
      <li>Predict the model on 2nd, 3rd, etc. 
      <li>Plot the predicted vs. actual results. 
      <li>Perform RFE and hyperparameter tuning. Plot all results. 
      </li>
    </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 4 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 4 SOLUTION END </b>
</div>

In [None]:
# How does more data affect the result?
# Let's concatenate A,B,C,D and predict E for DT 

In [None]:
# Reload the five raw well spreadsheets, discarding the earlier column
# selection and renaming so the merge below starts from the original data.
VertA, VertB, VertC, VertD, VertE = (
    pd.read_excel('Vert' + well + '_merge.xlsx') for well in 'ABCDE'
)

In [None]:
# Bring the neutron-porosity curves of the freshly reloaded wells onto one scale.
# The curve is called NPOR:1 in VertA and NPRL:1 in VertB/VertD; those three are
# divided by 100 (presumably percent -> fraction) to match the other wells.
# NOTE: re-running this cell divides again, so run it once per data load.
for _well, _curve in ((VertA, 'NPOR:1'), (VertB, 'NPRL:1'), (VertD, 'NPRL:1')):
    _well[_curve] = _well[_curve] / 100

In [None]:
# Build one training table from wells A-D and keep E aside as the well to predict.
# Every well is reduced to the same eight curves under one naming scheme:
# DEPT, DT, GR, ROP, WOB, RPM, RHOB, NPHI.
# VertA's RPM column is all zeros, so TOP_DRIVE_RPM is used as its RPM substitute.

# (original name, standard name) pairs, listed in the output column order.
_SUBSET_RENAMES_A = [("DT35", "DT"), ("GMSG", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
                     ("TOP_DRIVE_RPM", "RPM"), ("DEN", "RHOB"), ("NPOR:1", "NPHI")]
_SUBSET_RENAMES_B = [("DT35", "DT"), ("GMGC", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
                     ("ROTARY_RPM", "RPM"), ("DEN", "RHOB"), ("NPRL:1", "NPHI")]
_SUBSET_RENAMES_C = [("DTCO", "DT"), ("GR", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
                     ("ROTARY_RPM", "RPM"), ("RHOB", "RHOB"), ("NPOR", "NPHI")]
_SUBSET_RENAMES_D = [("MCDT", "DT"), ("GMGC", "GR"), ("ROP_-_FAST:1", "ROP"), ("BIT_WEIGHT", "WOB"),
                     ("ROTARY_RPM", "RPM"), ("DEN", "RHOB"), ("NPRL:1", "NPHI")]
_SUBSET_RENAMES_E = [("DT", "DT"), ("GRTO", "GR"), ("ROP", "ROP"), ("WOB", "WOB"),
                     ("RPM", "RPM"), ("RHOB", "RHOB"), ("NPHI", "NPHI")]


def _make_subset(well, renames):
    """Keep DEPT plus the listed curves, rename them, and stringify the row index."""
    keep = ['DEPT'] + [old for old, _ in renames]
    return well[keep].rename(index=str, columns=dict(renames))


VertA_subset = _make_subset(VertA, _SUBSET_RENAMES_A)
VertB_subset = _make_subset(VertB, _SUBSET_RENAMES_B)
VertC_subset = _make_subset(VertC, _SUBSET_RENAMES_C)
VertD_subset = _make_subset(VertD, _SUBSET_RENAMES_D)
VertE_subset = _make_subset(VertE, _SUBSET_RENAMES_E)


# Stack A-D row-wise and renumber the rows 0..n-1.
frames = [VertA_subset, VertB_subset, VertC_subset, VertD_subset]
mergeABCD = pd.concat(frames, ignore_index=True)

In [None]:
mergeABCD.shape

In [None]:
# Inputs (six curves) and DT target for the merged A-D training table and for VertE.
_merge_features = ['ROP', 'WOB', 'RPM', 'GR', 'RHOB', 'NPHI']

X_VertABCD, y_VertABCD = mergeABCD[_merge_features], mergeABCD['DT']

X_VertE, y_VertE = VertE_subset[_merge_features], VertE_subset['DT']

In [None]:
# SVR works much faster when data is normalized.
# Normalizer rescales each ROW (sample) to unit norm and learns nothing from
# the data in `fit`, so fitting it separately per table is harmless.

_merge_columns = ['ROP','WOB','RPM','GR','RHOB','NPHI']

scaler = Normalizer().fit(X_VertABCD[_merge_columns])
normalized_mergeABCD_X = scaler.transform(X_VertABCD[_merge_columns])

scaler = Normalizer().fit(VertE_subset[_merge_columns])
normalized_VertE_subset_X = scaler.transform(VertE_subset[_merge_columns])

# Convert back to pandas dataframes.
# Pass a FLAT list of names: a nested list ([[...]]) makes pandas build a
# one-level MultiIndex whose labels are tuples such as ('ROP',) rather than
# plain strings.
normalized_mergeABCD_X = pd.DataFrame(normalized_mergeABCD_X, columns=_merge_columns)
normalized_VertE_subset_X = pd.DataFrame(normalized_VertE_subset_X, columns=_merge_columns)

In [None]:
# New recursive-feature-elimination selector (keep four features) for the merged
# A-D data; this replaces the selector fitted on VertB earlier.
select = RFE(svr,n_features_to_select=4)

In [None]:
# Rank the six normalized features of the merged A-D table against DT, keep the
# selected ones, and print the shape before and after selection.
normalized_mergeABCD_X_FS = select.fit(normalized_mergeABCD_X, y_VertABCD).transform(normalized_mergeABCD_X)
for _frame in (normalized_mergeABCD_X, normalized_mergeABCD_X_FS):
    print(_frame.shape)

In [None]:
mask = select.get_support()
mask

In [None]:
# Draw the boolean support mask as a single-row image, one cell per input
# feature in column order. Black is True (kept), White is False (dropped).
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

In [None]:
normalized_mergeABCD_X.columns.tolist()

In [None]:
# Label the RFE-selected columns using the selector's own support mask rather
# than a hand-typed list, so the names always match what RFE actually kept.
normalized_mergeABCD_X_FS = pd.DataFrame(normalized_mergeABCD_X_FS,
                                         columns=normalized_mergeABCD_X.columns[select.get_support()])

In [None]:
# Cross-validated hyperparameter search on the RFE-selected A-D features.
# Five shuffled folds with a fixed seed, so every candidate sees the same splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 8 values of C x 6 values of gamma = 48 candidate settings.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 1232],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10],
}

# Exhaustive search over the grid using a default-constructed SVR, all CPU cores.
grid = GridSearchCV(
    SVR(),
    param_grid=param_grid,
    cv=cv,
    verbose=1,
    return_train_score=True,
    n_jobs=-1,
)

grid.fit(normalized_mergeABCD_X_FS, y_VertABCD)

In [None]:
print(grid.best_score_)

In [None]:
print(grid.best_params_)

In [None]:
# Take the hyperparameter tuned model with RFE to predict DT for VertE.
# Pick VertE's columns with the selector's support mask so they are exactly the
# features the tuned model was trained on, even if RFE's choice changes.
hypertune_SVR = grid.predict(normalized_VertE_subset_X.loc[:, select.get_support()])

In [None]:
# Overlay measured DT and the tuned-SVR prediction for VertE against depth.
# x, y and z stay as module-level names because the scoring cell below reads y and z.
x, y, z = VertE['DEPT'], VertE['DT'], hypertune_SVR

p = figure(plot_height=300, plot_width=800, title='VertE Predicted DT using SVR')
r = p.line(x, y, legend_label='DT', line_width=0.5, color='green')
r1 = p.line(x, z, legend_label='Pred. DT SVR', line_width=0.5, color='blue')

p.legend.location = 'top_left'
p.yaxis.axis_label = 'DT (ms/ft)'
p.xaxis.axis_label = 'Depth (ft)'
show(p)

<div class="alert alert-success">
    <b>EXERCISE 5</b>:
     <ul>
      <li>Score Pred. DT SVR using mean absolute error.</li>
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 5 SOLUTION START </b>
</div>

In [None]:
# Mean absolute error of the tuned SVR's DT prediction for VertE.
# Score the named target and prediction directly instead of the plot
# temporaries `y`/`z`, which any re-run plot cell overwrites, and print the
# result so the cell actually reports the score.
error = mean_absolute_error(y_VertE, hypertune_SVR)
print(error)

<div class="alert alert-success">
    <b>EXERCISE 5 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 6</b>:
     <ul>
      <li> 
       1. Concatenate A,B,C,D and predict E for RHOB
       <li>
       2. Setup X and y matrices.
       <li>
       3. Normalize data and convert back to dataframe.
       <li>
       4. Perform RFE.
       <li>
       5. Perform hyperparameter tuning.
       <li>
       6. Predict VertE with RFE tuned model.
       <li>
       7. Plot VertE RHOB vs. prediction.
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 6 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 6 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 7</b>:
     <ul>
      <li> 
       1. Concatenate Matrices and predict E for PE=PDPE.
       Hint: Locate all the wells that have PDPE data.
       <li>
       2. Setup X and y matrices.
       <li>
       3. Normalize data and convert back to dataframe.
       <li>
       4. Perform RFE.
       <li>
       5. Perform hyperparameter tuning.
       <li>
       6. Predict VertE with RFE tuned model.
       <li>
       7. Plot VertE PE vs. prediction.    
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 7 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 7 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 8</b>:
     <ul>
      <li> 
       1. Cut VertC log in roughly half.
       <li>
       2. Setup X and y matrices to predict DTSM.
       <li>
       3. Normalize data and convert back to dataframe.
       <li>
       4. Perform RFE.
       <li>
       5. Perform hyperparameter tuning.
       <li>
       6. Predict remainder of VertC with RFE tuned model.
       <li>
       7. Plot VertC DTSM vs. prediction.    
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 8 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 8 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 9</b>:
     <ul>
      <li> 
       Let's pretend to be geologists, unless you are a real one (separate assignment).
       <li>
       1. Cut out the section of every well between 5000 and 6000 feet.
       <li>
       2. Concatenate 4 of the wells and use them to predict DT for the 5th well.
       <li>
       3. Hyperparameter tune.
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 9 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 9 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 10</b>:
     <ul>
      <li> 
       Let's pretend to be geologists, unless you are a real one (separate assignment).
       <li>
       1. Cut out the section of every well between 5000 and 6000 feet.
       <li>
       2. Concatenate 4 of the wells and use them to predict RHOB for the 5th well.
       <li>
       3. Hyperparameter tune.
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 10 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 10 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 11</b>:
     <ul>
      <li> 
       Let's pretend to be geologists, unless you are a real one (separate assignment).
       <li>
       1. Cut out the section of every well between 5000 and 6000 feet.
       <li>
       2. Concatenate 4 of the wells and use them to predict NPHI for the 5th well.
       <li>
       3. Hyperparameter tune.
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 11 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 11 SOLUTION END </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 12</b>:
     <ul>
      <li> 
       Take a look at decision tree regression: https://scikit-learn.org/stable/modules/tree.html#regression
       <li>
       1. Concatenate A,B,C,D and predict E for DT.
       <li>
       2. Setup X and y matrices.
       <li>
       3. Perform RFE.
       <li>
       4. Perform hyperparameter tuning.
       <li>
       5. Predict VertE with RFE tuned model.
       <li>
       6. Plot VertE DT actual vs. prediction.
     </ul>
</div>

<div class="alert alert-success">
    <b>EXERCISE 12 SOLUTION START </b>
</div>

<div class="alert alert-success">
    <b>EXERCISE 12 SOLUTION END </b>
</div>