# Marker detection

## Data loading

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
from report.data_exploration import *
%load_ext autoreload
%autoreload 2

In [None]:
# Explicit import: do not rely on the star import of report.data_exploration
# happening to expose the pickle module.
import pickle

# get_data() is expected to come from the star import of report.data_exploration;
# it returns the three data frames and the well names unpacked below.
df_logs, df_loc, df_tops, well_names = get_data()

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open "markers_training.pkl" if it comes from a trusted source.
with open("markers_training.pkl", "rb") as f:
    dict_markers = pickle.load(f)
# Iterating a dict yields its keys, so this lists the entries available in dict_markers.
print(", ".join(dict_markers))

### Meta data
###### :question: Visualize the data in an interactive map where you can locate each well
---------------

In [None]:
# Pass df_loc (returned by get_data() in the data-loading cell) to locate_wells.
# NOTE(review): locate_wells is not defined in this notebook; presumably it comes from the
# star import of report.data_exploration and draws the interactive well map — confirm.
locate_wells(df_loc)

## Data visualization

### Channels

###### :question: Visualize the data: intensity (histogram, range), time series (spikes? noise?)
###### :question: What do you observe? 
---------------
#####  :mag: Observations
- A few spikes (negative values), which we clamp to zero to avoid visualization issues. They are probably measurement outliers and could instead be set to `np.nan` or masked.
- Most of the "interesting events" happen at high depths.
- Visually, it seems like the shallow signal is mostly noise (<6km). 
- The noisy shallow area could be a good area of interest to calibrate a denoising system

In [None]:
# Call plot_well on df_logs for the first 20 entries of well_names only.
# NOTE(review): plot_well is not defined in this notebook; presumably it comes from the
# star import of report.data_exploration — confirm what it draws per well.
plot_well(df_logs, well_names[:20])


###### :question: Do we have constant depth sampling rate? what should you do if not?
---------------

#####  :mag:  Depth sampling rate
- We have access to a constant sampling rate (period=0.5m).
- If the depth sampling rate were not constant across wells, we could resample (e.g. with `np.interp`), but special care must be taken. With a mix of low and high sampling frequencies it is more complicated: either you resample everything to the lowest frequency (taking special care of aliasing), but you may lose information, or you try to interpolate (~ oversample) the lowest frequencies, but the resulting high frequencies may simply not be faithful.


In [None]:
# Per-well depth sequences; iterating works whether this is a list or an array.
depths = dict_markers["depths"]

# Length of the shortest depth sequence. Printed explicitly: a bare expression in
# the middle of a cell is not displayed by the notebook.
all_lengths = [len(well_depths) for well_depths in depths]
min_lengths = min(all_lengths)
print(min_lengths)

# Plot the step between consecutive depth samples for every well; a constant
# sampling rate shows up as flat, overlapping lines.
for all_depths in depths:
    depth_step = np.diff(all_depths)
    plt.plot(depth_step, "o-", alpha=0.5)
plt.ylim(0., 1.)
plt.grid()
plt.show()


### Annotations

###### :question:  How many classes do you have?
---------------


##### :mag: Depth of markers exploration
- 3 classes (Marcel - Sylvain - Conrad ... ordered by increasing depth) for each well. 
- Seems like we observe the same depth histogram basically 3 times with an offset.
- If there is a relationship between the 3 depths, we should be able to see it on the 2 histograms of differences 





In [None]:
# Marker depths as a 2-D array: rows are indexed first, marker columns second.
top_depths = np.array(dict_markers["top_depths"])
fig, ax = plt.subplots(figsize=(5, 5))
# Overlay one 100-bin depth histogram per marker name, using the matching column.
for column, marker_name in enumerate(dict_markers["top_names"]):
    marker_depths = top_depths[:, column]
    ax.hist(marker_depths, bins=100, label=marker_name)
ax.grid()
ax.legend()
ax.set_title("Histogram of events depths")
plt.show()

###### :question: Is the dataset balanced? do we have 3 markers per well?
---------------
#####  :mag:  Weakly labeled data
- Sometimes labels are missing (`depth=NaN`)
- We can plot the histograms of each class to check dataset balance.

In [None]:
# For every marker column, count the rows whose depth label is present (not NaN).
is_labelled = np.logical_not(np.isnan(top_depths))
counts = is_labelled.sum(axis=0)
print(counts)
fig, ax = plt.subplots(figsize=(5, 5))
bar_colors = ['lightblue', 'orange', 'green']
# One bar per marker name, its height being the number of available labels.
ax.bar(dict_markers["top_names"], counts, color=bar_colors)
ax.set_title(f"Number of labelled marker per type of marker {counts}")
ax.grid()
plt.show()


###### :question:  Do we have always the same order? 
---------------

##### :mag: Relative depth
- Let's plot the histogram of the differences (= relative depths) between the event depths.
- It looks like once we find one of the events, we almost know where to search for the other ones from the distribution.

In [None]:
# Histogram of the depth gap between consecutive marker columns (column k+1 minus
# column k). Legend labels are built from dict_markers["top_names"] so they always
# match the column order of top_depths instead of relying on hard-coded names.
marker_names = list(dict_markers["top_names"])
plt.figure(figsize=(5,5))
for prev_col in range(len(marker_names) - 1):
    next_col = prev_col + 1
    plt.hist(
        top_depths[:, next_col] - top_depths[:, prev_col],
        bins=100,
        label=f"{marker_names[next_col]} - {marker_names[prev_col]}",
    )
plt.grid()
plt.legend()
plt.title("Histogram of relative depths between events")
plt.show()

###### :question:  Display the patterns corresponding to each class, on a same graph. Do you see any specific pattern? How much variation (shape, amplitude)?
---------------

#### :mag: Patterns

In [None]:
# Marker sample indexes per well; kept as floats because NaN marks a missing marker.
top_indexes = np.array(dict_markers["top_index"])
neighborhood = 100
signal = dict_markers["logs"]
# One figure per marker type, overlaying the raw window around the marker for the
# first 30 wells.
for top_type_index, top_type in enumerate(dict_markers["top_names"]):
    for well_index in range(30):
        idx_top = top_indexes[well_index][top_type_index]
        if np.isnan(idx_top):
            # No label for this marker in this well.
            continue
        idx_top = int(idx_top)
        # Clamp the window start: a negative start would wrap around to the end
        # of the log and silently yield an empty or wrong slice.
        start = max(idx_top - neighborhood, 0)
        window = signal[well_index][start: idx_top + neighborhood]
        # Negative samples are clipped to zero before plotting.
        plt.plot(window.clip(0, None), "-", alpha=0.5)
    plt.title(top_type)
    plt.grid()
    plt.show()

Let's normalize the patterns (whiten) and look at various neighborhoods.

In [None]:
# Marker sample indexes per well; kept as floats because NaN marks a missing marker.
top_indexes = np.array(dict_markers["top_index"])
# Loop-invariant inputs, read once instead of on every iteration.
signal = dict_markers["logs"]
depths = dict_markers["depths"]
total_signals = 20
# One figure per neighborhood size, with one subplot per marker type.
for neighborhood in [50, 100, 200]:
    plt.figure(figsize=(15,5))
    for top_type_index, top_type in enumerate(dict_markers["top_names"]):
        plt.subplot(1,3,top_type_index+1)
        for well_index in range(total_signals):
            idx_top = top_indexes[well_index][top_type_index]
            if np.isnan(idx_top):
                # No label for this marker in this well.
                continue
            idx_top = int(idx_top)
            # Clamp the window start: a negative start would wrap around to the
            # end of the log and silently yield an empty or wrong slice.
            start = max(idx_top - neighborhood, 0)
            stop = idx_top + neighborhood
            sig = signal[well_index][start:stop].clip(0,None)
            # Whiten the window (zero mean, unit variance); the epsilon avoids a
            # division by zero on a constant window.
            sig = (sig - sig.mean())/(sig.std()+1e-10)
            extracted_depths = depths[well_index][start:stop]
            # x axis: depth relative to the marker, so every marker sits at 0.
            plt.plot(extracted_depths-depths[well_index][idx_top], sig, "-", alpha=0.5)
        plt.xlabel("Relative depth around marker")
        plt.plot([0,0], [-3 ,3], "k--", alpha=0.5, label="Marker location")
        plt.ylim(-3,3)
        plt.legend()
        plt.title(top_type)
        plt.grid()
    plt.suptitle(f"{total_signals} marker patterns visualized in a {2*neighborhood} depth neighborhood")
    plt.show()

In [None]:
# Marker sample indexes per well; kept as floats because NaN marks a missing marker.
top_indexes = np.array(dict_markers["top_index"])
# Loop-invariant inputs, read once instead of on every iteration.
signal = dict_markers["logs"]
depths = dict_markers["depths"]
neighborhood = 64
# One figure per number of overlaid wells, with one subplot per marker type.
for total_signals in [20, 100, 600]:
    plt.figure(figsize=(15,5))
    for top_type_index, top_type in enumerate(dict_markers["top_names"]):
        plt.subplot(1,3,top_type_index+1)
        for well_index in range(total_signals):
            idx_top = top_indexes[well_index][top_type_index]
            if np.isnan(idx_top):
                # No label for this marker in this well.
                continue
            idx_top = int(idx_top)
            # Clamp the window start: a negative start would wrap around to the
            # end of the log and silently yield an empty or wrong slice.
            start = max(idx_top - neighborhood, 0)
            stop = idx_top + neighborhood
            sig = signal[well_index][start:stop].clip(0,None)
            # Whiten the window (zero mean, unit variance); the epsilon avoids a
            # division by zero on a constant window.
            sig = (sig - sig.mean())/(sig.std()+1e-10)
            extracted_depths = depths[well_index][start:stop]
            # x axis: depth relative to the marker, so every marker sits at 0.
            plt.plot(extracted_depths-depths[well_index][idx_top], sig, "-", alpha=0.5)
        plt.xlabel("Relative depth around marker")
        plt.plot([0,0], [-3 ,3], "k--", alpha=0.5, label="Marker location")
        plt.ylim(-3,3)
        plt.legend()
        plt.title(top_type)
        plt.grid()
    plt.suptitle(f"{total_signals} marker patterns visualized in a {2*neighborhood} depth neighborhood")
    plt.show()

### Inference code
- You are now provided with several signatures per marker.
- Implement the inference code, the evaluation code
- Quantify the results on the test set: compare several distance metrics
- Improve the results: offset well analysis, ordering.

In [None]:
from report.patterns_extraction import get_marker_data, extract_markers_logs, find_template_patterns
%load_ext autoreload
%autoreload 2
# NOTE(review): this rebinds dict_markers, replacing the dictionary loaded from
# "markers_training.pkl" in the data-loading cell — confirm that is intended.
dict_markers = get_marker_data()
# Feed the marker data to extract_markers_logs; the result is passed to
# find_template_patterns in the next cell.
dict_patterns = extract_markers_logs(dict_markers)

In [None]:
# Pass the extracted patterns to find_template_patterns (imported from
# report.patterns_extraction in the previous cell).
# NOTE(review): what it displays or returns is not visible from this notebook — confirm.
find_template_patterns(dict_patterns)