In [None]:
import tensorflow as tf
# Build a TF1-compat session configuration with incremental GPU memory allocation enabled.
# NOTE(review): `config` is not passed to any session or tf.config call in the visible
# notebook, so it may have no effect as written - confirm what was intended.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
import os


In [None]:
from typing import List
from typing import Union
from typing import Tuple

# Default step, in samples, between two consecutive candidate window centres.
SHIFT_STEP = 3
# Default half-width, in samples, of the range of candidate centres explored around a top.
NB_SAMPLES = SHIFT_STEP * 5
# Half-width of a window in samples: a full window holds 2 * WINDOW_LENGTH + 1 values.
WINDOW_LENGTH = 30
# Factor used by TopFinder to widen the observed [min, max] top-depth range when
# selecting candidate windows.
DILATION_RATIO = 1.5

def get_well_relevant_windows(top_index: int, df_well: pd.DataFrame, nb_samples: int=NB_SAMPLES,
                              shift: int=SHIFT_STEP, ratio: Union[None, float]=None) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """
    Extract labelled GR windows centred on positions around a top.

    Candidate centres are the positions from ``top_index - nb_samples`` (inclusive) to
    ``top_index + nb_samples`` (exclusive), visited every ``shift`` samples. Each window
    holds the ``2 * WINDOW_LENGTH + 1`` GR values centred on the candidate; candidates
    whose window does not fit entirely inside the well are skipped.
    A window is labelled True when the depth at its centre is less than 4 away from the
    depth at ``top_index``, and False otherwise.

    :param top_index: positional index of the top in df_well
    :param df_well: pd.DataFrame(columns=['wellName', 'DEPTH', 'GR'])
    :param nb_samples: half-width, in samples, of the range of candidate centres
    :param shift: step, in samples, between two consecutive candidate centres
    :param ratio: when falsy every window is kept; otherwise True windows are always kept
        and a False window is dropped while the fraction of False windows among the
        windows seen so far is below ``ratio``
    :return: (windows, labels) in matching order; every window is an array of shape
        (2 * WINDOW_LENGTH + 1, 1) and every label a 0-d boolean array
    """
    # Column lookups are loop-invariant: do them once instead of once per candidate.
    gr_values = np.asarray(df_well['GR'].values)
    depth_values = np.asarray(df_well['DEPTH'].values)
    nb_rows = len(gr_values)

    windows = []
    labels = []
    positives = 0
    negatives = 0
    for center in range(top_index - nb_samples, top_index + nb_samples, shift):
        left_limit = center - WINDOW_LENGTH
        right_limit = center + WINDOW_LENGTH
        # Explicit bounds check: a window that sticks out of the well is skipped, and a
        # negative limit is never silently interpreted as "count from the end".
        if left_limit < 0 or right_limit >= nb_rows:
            continue
        # Column vector of the GR values; copied so the windows do not alias the frame.
        window = gr_values[left_limit:right_limit + 1].reshape(-1, 1).copy()
        label = abs(depth_values[center] - depth_values[top_index]) < 4

        keep_window = True
        if ratio and not label:
            # NOTE(review): False windows are dropped while they are *under* `ratio`,
            # which looks inverted for a balancing ratio - confirm the intended rule.
            keep_window = not (negatives / max(positives + negatives, 1) < ratio)
        if keep_window:
            windows.append(window)
            labels.append(np.array(label))

        # The counters track every window seen, whether it was kept or dropped.
        if label:
            positives += 1
        else:
            negatives += 1

    return windows, labels


# TODO: check index and len depth

def generate_top_dataset(df_logs: pd.DataFrame, df_tops: pd.DataFrame,
                         top: str='CONRAD', ratio: Union[None, float]=None) -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """
    Build the labelled window dataset of one top over every well of df_logs.

    For each well the depth sample closest to the labelled top depth is located, and the
    windows around it are extracted with get_well_relevant_windows (shift of 1 sample,
    100 samples on each side). A well is skipped, with a printed message, when its top
    depth is NaN or when the closest depth sample is more than 3 away from it.

    df_logs contains : 'wellName', 'DEPTH', 'GR'
    df_tops is indexed by well name and contains a column named after the top

    :param df_logs: pd.DataFrame
    :param df_tops: pd.DataFrame
    :param top: str='CONRAD'
    :param ratio: Union[None, float]=None, forwarded to get_well_relevant_windows
    :return: (windows, labels) accumulated over all wells, in matching order
    """
    windows = []
    labels = []
    for well_name in df_logs['wellName'].drop_duplicates().tolist():
        df_well = df_logs[df_logs['wellName'] == well_name]
        top_position = df_tops.loc[well_name][top]
        if np.isnan(top_position):
            print("NAN FOUND")
            continue
        # The labelled depth is sometimes absent from the log: take the closest sample.
        # argmin returns the first minimum, like the previous min() + list.index() pair,
        # without a Python-level pass over every depth sample.
        depths = np.asarray(df_well['DEPTH'].values)
        distances = np.abs(depths - top_position)
        top_index = int(np.argmin(distances))
        if distances[top_index] > 3:
            print("DATA BAD LABELLED")
            continue
        windows_, labels_ = get_well_relevant_windows(top_index=top_index, df_well=df_well, shift=1, nb_samples=1 * 100,
                                                      ratio=ratio)
        windows.extend(windows_)
        labels.extend(labels_)
    return windows, labels



class TopFinder:
    """
    TopFinder: wrapper for window classifier

    Limitations:
    - Work on single one top and assume independence among tops
    - Find top by classifying windows extracted from well data and discard
      the correlation between windows
    - Does not utilize geographical info of wells

    Usage example:

        >>> model.fit(dataset)
        >>> model.evaluate_windows = a_func

        >>> top_finder = TopFinder(model, top_name)
        >>> top_finder.examine_dataset(df_tops)

        >>> predicted_depth = top_finder.find_top(df_well)

    After find_top, the candidates, their scores and the selected depth stay available
    as ``windows``, ``scores`` and ``top_depth``.
    """

    def __init__(self, fitted_window_classifier, top_name):
        """
        :param fitted_window_classifier: object exposing a callable ``evaluate_windows``
            that takes an array of windows and returns one score per window
        :param top_name: name of the top, used as a column name of the tops table
        :raises ValueError: when the classifier has no callable ``evaluate_windows``
        """
        # getattr with a default: a classifier without the attribute gets the documented
        # ValueError instead of an AttributeError.
        if not callable(getattr(fitted_window_classifier, 'evaluate_windows', None)):
            raise ValueError("fitted_window_classifier has to have function evaluate_windows")
        self.window_classifier = fitted_window_classifier
        self.work_on_top = top_name
        self.stats = {}

    def examine_dataset(self, df_tops:pd.DataFrame):
        """
        Record the largest and smallest depth of the top found in df_tops.

        Has to be called before find_top: the recorded range bounds the candidates.

        :param df_tops: pd.DataFrame with a column named after the top
        """
        self.stats['top_depth_max'] = df_tops[self.work_on_top].max()
        self.stats['top_depth_min'] = df_tops[self.work_on_top].min()

    def extract_window(self, df_well:pd.DataFrame, center_idx, window_length):
        """
        Return the GR values of the rows labelled center_idx - window_length to
        center_idx + window_length (both included).

        The slice is label based (``.loc``): a full window of 2 * window_length + 1
        values is only obtained when all those index labels exist in df_well.

        :param df_well: pd.DataFrame with a 'GR' column
        :param center_idx: index label of the window centre
        :param window_length: half-width of the window
        :return: 1-d np.ndarray, shorter than a full window when labels are missing
        """
        left_limit = center_idx - window_length
        right_limit = center_idx + window_length
        window = df_well.loc[left_limit : right_limit, 'GR'].to_numpy()
        return window

    def get_candidate_windows(self, df_well:pd.DataFrame):
        '''
        extra prior knowledge may be used to narrow down the scope of candidates,
        e.g. top distribution.

        Candidates are the rows whose depth lies strictly inside the top depth range
        recorded by examine_dataset, widened around its centre by DILATION_RATIO.

        return list of windows. Each window includes the depth of its center & GR data.
        '''
        max_, min_ = self.stats['top_depth_max'], self.stats['top_depth_min']
        center_ = (max_ + min_) / 2
        half_span_ = DILATION_RATIO * (max_ - min_) / 2
        dilated_max_ = center_ + half_span_
        dilated_min_ = center_ - half_span_

        # Select the in-range rows with one vectorised mask instead of building a
        # Series for every row of the well with iterrows().
        depths = df_well['DEPTH']
        in_scope = (depths < dilated_max_) & (depths > dilated_min_)

        expected_shape = (WINDOW_LENGTH * 2 + 1,)
        windows = []
        for idx, window_depth in depths[in_scope].items():
            window_data = self.extract_window(df_well, idx, WINDOW_LENGTH)
            if window_data.shape != expected_shape:
                # It happens when the window gets out of the scope of well depth
                continue
            windows.append((window_depth, window_data))
        return windows

    def select_window(self, windows, scores: np.array):
        '''
        Return the window with the highest score.

        extra prior knowledge may be used here, e.g. top relationships
        '''
        index_max = np.argmax(scores, axis=0)
        return windows[index_max]

    def find_top(self, df_well):
        """
        Step:
            1. Extract all candidate windows from the well
            2. Evalute each candidate by window classifier
            3. Select the best candidate
            4. Return its associated depth

        :param df_well: pd.DataFrame with 'DEPTH' and 'GR' columns for one well
        :return: depth at the centre of the best scored candidate window
        :raises Exception: when no classifier is set or the well has no data
        :raises ValueError: when the well yields no candidate window
        """
        if self.window_classifier is None:
            raise Exception("window_classifier is not set")
        if df_well.shape[0] == 0:
            raise Exception("input well has no data")

        self.windows = self.get_candidate_windows(df_well)
        print(f'{len(self.windows)} candidate windows')
        if not self.windows:
            # Fail with a clear message rather than somewhere inside the classifier.
            raise ValueError("no candidate window found in the input well")
        windows_data = np.array([w[1] for w in self.windows])
        self.scores = self.window_classifier.evaluate_windows(windows_data)
        selected_window = self.select_window(self.windows, self.scores)
        self.top_depth = selected_window[0]

        return self.top_depth



def get_true_windows(df_logs, df_tops, top_, keep_depth = False):
    """
    Return only the windows labelled True for the top ``top_``.

    The dataset is built with generate_top_dataset; the number of windows, the array
    shapes and the number of True windows are printed along the way.

    :param df_logs: logs table forwarded to generate_top_dataset
    :param df_tops: tops table forwarded to generate_top_dataset
    :param top_: name of the top
    :param keep_depth: accepted but not used by this function
    :return: array of the windows whose label is True
    """
    windows, labels = generate_top_dataset(df_logs=df_logs, df_tops=df_tops, top=top_)
    well_count = len(df_logs['wellName'].unique())
    print(f'{len(windows)} windows extracted from {well_count} wells')

    # Drop the trailing singleton axis of the windows.
    X = np.array(windows).squeeze(axis=2)
    y = np.array(labels)

    print('X:', X.shape)
    print('y:', y.shape)

    # Positions of the True labels, in order.
    true_idx = np.flatnonzero(y == True)
    print(f'{len(true_idx)} true windows left')

    return X[true_idx]

def get_true_depth(wellname, top, df_tops):
    """
    Look up the labelled depth of a top for one well.

    :param wellname: index label of the well in df_tops
    :param top: column name of the top in df_tops
    :param df_tops: tops table
    :return: the value stored at row ``wellname``, column ``top``
    """
    true_depth = df_tops.loc[wellname, top]
    return true_depth


def visual_scores(depths, scores, max_score_depth=None, true_depth=None, well_name=None):
    """
    Plot the evaluation score of the candidate windows against their depth.

    :param depths: x values, one depth per candidate window
    :param scores: y values, one score per candidate window
    :param max_score_depth: optional depth marked with a yellow vertical line
    :param true_depth: optional depth marked with a green vertical line
    :param well_name: optional well name appended to the title
    :return: the plotly figure
    """
    # Imported here because the notebook never imports plotly.graph_objects at cell
    # level, which made every call fail with a NameError on `go`.
    import plotly.graph_objects as go

    title = "Evaluation Score w.r.t depth"
    if well_name:
        title += f' [well: {well_name}]'
    fig = go.Figure(data=[go.Scatter(x=depths, y=scores)], layout={'title': title})
    # Compare with None so that a legitimate depth of 0 still gets its marker.
    if max_score_depth is not None:
        fig.add_vline(x=max_score_depth, line_width=2, line_color="yellow",
                      annotation_text='Predicated', annotation_position='top left')
    if true_depth is not None:
        fig.add_vline(x=true_depth, line_width=2, line_color="green",
                      annotation_text='True', annotation_position='top right')
    return fig



Loading the datasets

In [None]:

#uncomment the top to work with one top at a time

#top = 'CONRAD' 
#top = 'SYLVAIN'
# Name of the top the rest of the notebook works on (a column of the tops tables).
top = 'MARCEL'
# Tables used to build the training dataset.
df_logs = pd.read_parquet("../data/logs.parquet")
df_loc = pd.read_parquet("../data/loc.parquet")
df_tops = pd.read_parquet("../data/tops.parquet")

# Tables used to build the test dataset; the test tops come from a CSV whose first
# column is used as the index.
df_logs_test = pd.read_parquet("../testdata/logs_50.parquet")
df_loc_test = pd.read_parquet("../testdata/loc_50.parquet")
df_tops_test = pd.read_csv("../testdata/tops_50.csv", index_col=0)

In [None]:
# Each dataset is a (windows, labels) pair built for the selected top.
train_dataset = generate_top_dataset(df_logs= df_logs, df_tops=df_tops, top=top)
test_dataset = generate_top_dataset(df_logs=df_logs_test, df_tops=df_tops_test, top=top)

Data Pipeline

In [None]:
# Stack the windows into one array and drop their trailing singleton axis (axis 2);
# the labels become a flat array in the same order.
X = np.array(train_dataset[0]).squeeze(axis=2)
y = np.array(train_dataset[1])

# Same conversion for the test dataset.
X_test= np.array(test_dataset[0]).squeeze(axis=2)
y_test = np.array(test_dataset[1])

In [None]:
# Sanity check: shapes of the window arrays and of their label arrays.
print("Training set: ", X.shape,y.shape)
print("Testing set: ", X_test.shape,y_test.shape)

Time series to Image Transformation

To work with a CNN, the time series need to be converted to a grid-like format; therefore each window is transformed into an image using GASF, GADF or MTF.

In [None]:
# a method to change timeseries to either Gramian Angular Summation Field (gasf) or Gramian Angular Difference Field (gadf)

def gaf(X,method):
    """
    Transform time series into Gramian Angular Field images.

    :param X: time series passed to GramianAngularField.transform
        (assumed to be one series per row - TODO confirm)
    :param method: forwarded to GramianAngularField; the notebook uses 'summation'
        (GASF) and 'difference' (GADF)
    :return: the transformed array with an extra axis inserted at position 3
    """
    from pyts.image import GramianAngularField
    images = GramianAngularField(method=method).transform(X)
    # Insert the extra axis at position 3.
    return np.expand_dims(images, axis=3)

In [None]:
# a method to change timeseries to Markov Transition Field (MTF)

def mtf(X):
    """
    Transform time series into Markov Transition Field images.

    :param X: time series passed to MarkovTransitionField.transform
        (assumed to be one series per row - TODO confirm)
    :return: the transformed array with an extra axis inserted at position 3
    """
    from pyts.image import MarkovTransitionField
    images = MarkovTransitionField().transform(X)
    # Insert the extra axis at position 3.
    return np.expand_dims(images, axis=3)

In [None]:
# a method to change timeseries to the combined format of gasf,gadf and mtf

def combined(X):
    """
    Transform time series into the GASF, GADF and MTF images, concatenated in that
    order along axis 3.

    :param X: time series forwarded to gaf and mtf
    :return: concatenation of the three transformed arrays along axis 3
    """
    channels = (
        gaf(X,method='summation'),
        gaf(X,method='difference'),
        mtf(X),
    )
    return np.concatenate(channels,axis=3)

Calling the functions that change the time series to images
Make sure to choose the same image format for both the training and the testing dataset

In [None]:
#call one function at a time
# X_new holds the image version of the training windows X; X itself is left untouched.
#X_new= mtf(X)
#X_new= gaf(X,method='summation') #for gasf
X_new= gaf(X,method='difference') #for gadf
#X_new= combined(X)
X_new.shape

In [None]:
# Use the same transform as for the training set.
# NOTE(review): X_test is overwritten by its image version, so this cell is not
# re-runnable on its own - re-run the cell that builds X_test first.
#X_test= mtf(X_test)
#X_test= gaf(X_test,method='summation') #for gasf
X_test= gaf(X_test,method='difference') #for gadf
#X_test= combined(X_test)
X_test.shape

Visualization

In [None]:
#visualize the first window of the training set next to its transformed image
# Draw on the two axes returned by plt.subplots instead of creating a second,
# conflicting 2x2 grid with plt.subplot on top of them.
f, axs = plt.subplots(1, 2, figsize=(10, 5))
axs[0].set_title('Timeseries')
axs[0].plot(X[0])
axs[1].set_title('Image Transformed')
axs[1].imshow(X_new[0])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the transformed training windows for validation; the fixed
# random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_new, y, test_size=0.33, random_state = 0)

In [None]:
# Sanity check: shapes of the two splits.
print('Training Sets',X_train.shape, y_train.shape)
print('Validation Sets',X_val.shape, y_val.shape)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

In [None]:
#Model definition
model = Sequential()
#add model layers: two convolution + dropout stages, one max pooling, then the classifier
model.add(Conv2D(64, kernel_size=3, activation='sigmoid', input_shape=X_train.shape[1:]))
model.add(Dropout(0.2))
model.add(Conv2D(32, kernel_size=3, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(MaxPooling2D(pool_size = (2, 2)))
model.add(Flatten())
# Two outputs, one per class (False / True); column 1 is later used as the window score.
# NOTE(review): a sigmoid output combined with a categorical cross-entropy loss is
# unusual (softmax is the common choice) - confirm this is intended before changing it,
# since it affects the scores of the already saved models.
model.add(Dense(2, activation='sigmoid'))

#compile model using accuracy to measure model performance
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Stop when the validation loss has not improved for 3 epochs and keep the best weights.
callback = EarlyStopping(monitor='val_loss',patience=3, restore_best_weights=True)
# summary() prints by itself and returns None: wrapping it in print() added a stray "None".
model.summary()

In [None]:
# `callbacks` is documented as a list of callbacks, so wrap the single early-stopping callback.
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, callbacks=[callback])

#load mtf for conrad
#model= load_model('model/conrad/mtf/model.h5')

#model.save_weights('model/conrad/mtf/weights.h5')

Save the model for future use!
Each model is specific to one top and one image format, so there are 3 x 4 = 12 saved models in total (3 tops and 4 image formats).

In [None]:
#Uncomment one of the lines below to save the model according to the chosen top and image format

#FOR CONRAD
#model.save('model/conrad/mtf/model.h5')
#model.save('model/conrad/gasf/model.h5')
#model.save('model/conrad/gadf/model.h5')
#model.save('model/conrad/combined/model.h5')

#FOR MARCEL
#model.save('model/marcel/mtf/model.h5')
#model.save('model/marcel/gasf/model.h5')
#model.save('model/marcel/gadf/model.h5')
#model.save('model/marcel/combined/model.h5')

#FOR SYLVAIN
#model.save('model/sylvain/mtf/model.h5')
#model.save('model/sylvain/gasf/model.h5')
#model.save('model/sylvain/gadf/model.h5')
#model.save('model/sylvain/combined/model.h5')


Prediction Results Summary

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,cohen_kappa_score, roc_auc_score

In [None]:
# Predicted class of each test window: index of the largest of the model outputs.
y_pred = np.argmax(model.predict(X_test), axis=-1)

In [None]:

# Window-level classification metrics on the test set.
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %f' % accuracy)

# precision tp / (tp + fp)
precision = precision_score(y_test, y_pred)
print('Precision: %f' % precision)

# recall: tp / (tp + fn)
recall = recall_score(y_test, y_pred)
print('Recall: %f' % recall)

# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, y_pred)
print('F1 score: %f' % f1)

Classification Report

In [None]:

print('Prediction Accuracy Score {:.2f}.'.format(accuracy_score(y_test,y_pred)))
# Per-class precision / recall / f1 as a table (one row per class and per average).
clax_report = classification_report(y_test,y_pred,output_dict=True)
report_df = pd.DataFrame(clax_report).transpose()

print('Training Process Completed')

# Kept as the last expression of the cell: a bare expression followed by another
# statement is not rendered by the notebook, so the table was never displayed.
report_df

TESTING STARTS HERE!
The code below tests how to extract the specific depth at which the top is found in a well.

In [None]:
#from hacktops.model import TopFinder
#from hacktops.utils import get_true_windows
#from hacktops.settings import WINDOW_LENGTH
from plotly.offline import iplot
from tqdm import tqdm
#from hacktops.utils import get_true_depth, visual_scores

In [None]:
def evaluate_windows(self, candidate_windows):
    """
    Score candidate windows with the model this function is bound to.

    The windows are first converted to images; the transform enabled here has to be
    the one the model was trained with.

    :param candidate_windows: array of windows forwarded to the image transform
    :return: column 1 of the model predictions, one value per window
    """
    #images= mtf(candidate_windows)
    #images= gaf(candidate_windows,method='summation')
    images= gaf(candidate_windows,method='difference')
    #images= combined(candidate_windows)
    predictions = self.predict(images)
    return predictions[:,1]
import types
# Bind evaluate_windows to the trained model as a method, so that TopFinder can call
# model.evaluate_windows(windows).
model.evaluate_windows = types.MethodType(evaluate_windows, model)

In [None]:
# Wrap the trained model in a TopFinder working on the selected top.
top_finder = TopFinder(model, top)
top_finder.examine_dataset(df_tops) # records the min / max depth of `top` in the training tops

In [None]:
# Distinct well names of the test logs, and how many there are.
test_well_names = df_logs_test['wellName'].unique()
print(len(test_well_names))

In [None]:
# For every test well, predict the depth of the top and pair it with the labelled depth.
# Each entry of `result` is [well name, predicted depth, true depth].
result = []
for test_well_name in tqdm(test_well_names):
    # print(f'well: {test_well_name}')
    df_test_well = df_logs_test[df_logs_test['wellName'] == test_well_name]
    predicted_depth = top_finder.find_top(df_test_well)
    true_depth = get_true_depth(test_well_name, top, df_tops_test)
    result.append([test_well_name, predicted_depth, true_depth])

In [None]:
# Table of the results indexed by well name, then split into two single-column tables
# (predicted / true) whose column is named after the top.
df_result = pd.DataFrame(result, columns=['wellName', 'predicated_depth', 'true_depth']).set_index('wellName')
df_tops_pred = df_result[['predicated_depth']].rename(columns={'predicated_depth': top})
df_tops_true = df_result[['true_depth']].rename(columns={'true_depth': top})

In [None]:
from hacktops.evaluate import recall_tops

# Compare predicted and true depths with a tolerance of 4.
# NOTE(review): recall_tops comes from the hacktops package, which is not shown here;
# the meaning of its three return values is taken from the variable names - confirm.
# `recall` rebinds the name that held the window-level recall computed earlier.
recall, mae, df_res = recall_tops(df_tops_true, df_tops_pred, tolerance = 4)
print("recall {0}, mae {1}".format(recall,mae))
df_res.head(50)

Visualizing a well

In [None]:
# top_finder keeps the windows, scores and depth of its latest find_top call, and
# true_depth / test_well_name still hold the values of the last iteration of the
# evaluation loop: this plots the last test well that was processed.
depth = [w[0] for w in top_finder.windows]
fig = visual_scores(depth, top_finder.scores, top_finder.top_depth, true_depth, test_well_name)
iplot(fig)