In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, plot_confusion_matrix

from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm

plt.style.use("ggplot")
%load_ext autoreload
%autoreload 2

In [2]:
# Root folder for every file read or written by this notebook. The trailing slash
# matters: paths below are built by plain string concatenation.
data_folder = "data/"

In [3]:
from utils import reduce_mem_usage, WellLogsProcessing, plot_features_importance, penalty_score

In [4]:
# Raw training logs: a semicolon-separated CSV stored under data_folder.
# The path is built once and reused instead of being re-concatenated in the read call.
dest_file = data_folder + "train.csv"
train_df = pd.read_csv(dest_file, sep=";")

In [5]:
# Downcast column dtypes to save memory (the log below shows float64 -> float32).
# The second returned value (`na`) is not read anywhere in the visible cells.
train_df,na = reduce_mem_usage(train_df)

Memory usage of properties dataframe is : 258.9785385131836  MB
******************************
Column:  DEPTH_MD
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  X_LOC
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  Y_LOC
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  Z_LOC
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  CALI
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  RSHA
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  RMED
dtype before:  float64
dtype after:  float32
******************************
******************************
Column:  RDEP
dtype before:  float64
dtype after:  

## Splitting data per well.

In [6]:
### Inventory of labels and wells; the imputers are later fitted on the training wells only.
### Original author's note: lithology 93000 only appears in well 16/4-1.
all_lith = train_df["FORCE_2020_LITHOFACIES_LITHOLOGY"].unique()
all_wells = train_df["WELL"].unique()

### Four wells are held out: two for validation and two for testing.
val_wells = ["16/10-1", "16/2-16"]
test_wells = ["16/11-1 ST3", "34/3-1 A"]
held_out_wells = val_wells + test_wells
train_wells = [name for name in all_wells if name not in held_out_wells]

In [7]:
# Keep only the rows that belong to the training wells (validation/test wells excluded).
full_df = train_df.loc[train_df.WELL.isin(train_wells),:]

## data processing

Feature selection was done in the data_analysis notebooks. The selected features are either very discriminative or have very few null values.

In [8]:
# Numeric logs and well coordinates handed to the processing pipeline.
numeric_features = ['RMED', 'RDEP', 'RHOB', 'GR', 'NPHI','DTC','RXO','RSHA',
                    "X_LOC","Y_LOC","Z_LOC",'DEPTH_MD']
# Alternative feature sets tried previously (kept for reference):
# numeric_features = ['RMED', 'RDEP', 'RHOB', 'GR', 'NPHI','DTC',"X_LOC","Y_LOC","Z_LOC",'DEPTH_MD']
# numeric_features = ['RMED', 'RDEP','DRHO','CALI',
#                     'RHOB', 'GR', 'NPHI','DTC',"DTS",'PEF','SP','RXO','RSHA',
#                     "RMIC","X_LOC","Y_LOC","Z_LOC",'DEPTH_MD']

cat_features = ['GROUP','FORMATION']

others_numeric=[]

# Sorted vocabularies of the non-null categories present in the training wells.
# dropna() replaces the former list.remove(np.nan): remove() raises ValueError when a
# column holds no missing value, and it can only find NaN through object identity.
all_formations = sorted(full_df.FORMATION.dropna().unique())
all_groups = sorted(full_df.GROUP.dropna().unique())

#### Instantiate the logs processing pipeline. Here we will only fill numerical empty values:
#### outlier removal, categorical imputation and categorical encoding are all switched off.
logs_processor = WellLogsProcessing(numeric_features ,cat_features ,
                                    others_numeric,all_formations ,all_groups,
                                    remove_outliers=False,process_num_features=True,
                                      impute_categorical = False, encode_categorical=False)

In [None]:
######### Numerical features: empty values imputation and scaling
# is_train_data=True: according to the log below, this call fits the iterative imputer.
# The .loc[:, columns] selection hands over a new frame rather than full_df itself
# (presumably so full_df is left unmodified — confirm in WellLogsProcessing).
filled_full_data = logs_processor.get_processed_data(full_df.loc[:,full_df.columns],is_train_data=True)

Fitting iterative imputer
[IterativeImputer] Completing matrix with shape (1125622, 12)
[IterativeImputer] Change: 103628.125, scaled tolerance: 6856.661 
[IterativeImputer] Change: 336598.71875, scaled tolerance: 6856.661 
[IterativeImputer] Change: 153881.625, scaled tolerance: 6856.661 
[IterativeImputer] Change: 52670.0859375, scaled tolerance: 6856.661 


In [None]:
# Column dtypes and non-null counts after numeric imputation.
filled_full_data.info()

## GROUP, FORMATION PREDICTION
Now we will predict the GROUP and FORMATION features instead of simply imputing them with the mode value.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from utils import predict_target, get_predictor

In [None]:
### GROUP prediction: declare the predictors, configure the classifier, then train it
### and predict the GROUP feature.

group_predictors = [
    'DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC',
    'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI',
    'DTC', "RSHA",
]

# Hyper-parameters gathered in one place so they are easy to read and tune.
group_classifier_params = dict(
    iterations=500,
    learning_rate=0.1,
    depth=4,
    min_data_in_leaf=50,
    bagging_temperature=10,
    grow_policy='Depthwise',
    objective='MultiClassOneVsAll',
    custom_metric=['Accuracy'],
    early_stopping_rounds=100,
    task_type='GPU',
    train_dir=data_folder,
)
catboost_classifier = CatBoostClassifier(**group_classifier_params)

# get_predictor returns the frame, the fitted classifier and the predictors' scaler.
filled2, group_imputer, group_predictors_scaler = get_predictor(
    filled_full_data.loc[:, filled_full_data.columns],
    catboost_classifier,
    target='GROUP',
    regressors=group_predictors,
)

In [None]:
# Feature importances reported by the GROUP model returned by get_predictor.
plot_features_importance(group_imputer.feature_importances_,group_imputer.feature_names_,figsize=(20,20))

- We split the samples that have a non-null GROUP feature into train/val sets.
- After training, our GROUP classifier reaches 88% accuracy on the val set, so we can use it to impute the GROUP feature.

In [None]:
# Integer-encode GROUP as its position in the sorted all_groups list, then mark the
# column as categorical so it can be passed to CatBoost as a categorical feature.
# A dict lookup gives the same codes as list.index without a linear search per row.
group_codes = {name: code for code, name in enumerate(all_groups)}
filled2["enc_GROUP"] = filled2.GROUP.map(group_codes).astype(int).astype("category")

In [None]:
### FORMATION prediction: declare the predictors (including the encoded GROUP),
### configure the classifier, then train it and predict the FORMATION feature.
formation_predictors = [
    'DEPTH_MD', "enc_GROUP", 'X_LOC', 'Y_LOC', 'Z_LOC',
    'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI',
    'DTC', "RSHA",
]

target = "FORMATION"
encoded_cat = ["enc_GROUP"]
# CatBoost is told which predictor is categorical through its position in the list.
cat_1 = formation_predictors.index("enc_GROUP")

formation_classifier_params = dict(
    iterations=800,
    learning_rate=0.1,
    depth=4,
    min_data_in_leaf=50,
    bagging_temperature=10,
    cat_features=[cat_1],
    grow_policy='Depthwise',
    objective='MultiClass',
    custom_metric=['Accuracy'],
    early_stopping_rounds=100,
    task_type='GPU',
    train_dir=data_folder,
)
catboost_classifier = CatBoostClassifier(**formation_classifier_params)

# get_predictor returns the frame, the fitted classifier and the predictors' scaler.
filled3, formation_imputer, formation_predictors_scaler = get_predictor(
    filled2.loc[:, filled2.columns],
    catboost_classifier,
    target=target,
    regressors=formation_predictors,
    encoded_cat=encoded_cat,
)

In [None]:
# Feature importances reported by the FORMATION model returned by get_predictor.
plot_features_importance(formation_imputer.feature_importances_,
                         formation_imputer.feature_names_,figsize=(20,20))

In [None]:
# Accuracy of the FORMATION classifier over every row of filled2 whose FORMATION is known.
# NOTE(review): these rows presumably include the ones get_predictor trained on, so this
# figure is likely optimistic compared with a held-out score — confirm.
mask = filled2.FORMATION.notna()
# NOTE(review): a set difference yields an arbitrary column order; verify that it matches
# the order formation_predictors_scaler was fitted with inside get_predictor.
num_features = list(set(formation_predictors) - set(encoded_cat))
# Explicit copy: the frame is modified in place below and must not be a slice of filled2.
vali  = filled2.loc[mask,num_features].copy()
vali[vali.columns] = formation_predictors_scaler.transform(vali)
vali[encoded_cat] = filled2.loc[mask,encoded_cat]
# np.ravel: CatBoost may return multiclass labels as an (n, 1) column; flatten to 1-D.
formation_pred = np.ravel(formation_imputer.predict(vali[formation_predictors]))
print("Accuracy of FORMATION classifier is : ",accuracy_score(filled2.loc[mask,"FORMATION"].values,
                                              formation_pred))


## Final step
Fill null values for the train/val/test sets.

In [None]:
##let's fill the whole dataset and save everything
# is_train_data is left at its default here — presumably the processing fitted earlier is
# only applied, this time to every well (train/val/test); confirm in WellLogsProcessing.
filled_whole = logs_processor.get_processed_data(train_df.loc[:,train_df.columns])

In [None]:
##fill GROUP
# Work on an independent copy so filled_whole is never modified by the imputation.
filled_whole2 = filled_whole.copy()
mask = filled_whole2.GROUP.isna()
if mask.any():
    # Scale the predictors of the rows with a missing GROUP, then predict their label.
    vali = group_predictors_scaler.transform(filled_whole2.loc[mask,group_predictors])
    # np.ravel: CatBoost may return multiclass labels as an (n, 1) column, which is
    # fragile to assign into a single DataFrame column; flatten to 1-D first.
    filled_whole2.loc[mask,"GROUP"] = np.ravel(group_imputer.predict(vali))

In [None]:
## fill FORMATION
# Code assigned to any GROUP value missing from logs_processor.all_groups.
UNKNOWN_GROUP_CODE = 1000
# NOTE(review): at training time GROUP was encoded by its position in the sorted
# all_groups list; here the code comes from logs_processor.all_groups — confirm that
# both produce identical codes.
filled_whole2["enc_GROUP"] = (
    filled_whole2.GROUP
    .map(lambda x: logs_processor.all_groups.get(x, UNKNOWN_GROUP_CODE))
    .astype(int)
    .astype("category")
)

# Independent copy so filled_whole2 keeps the state it had before FORMATION imputation.
filled_whole3 = filled_whole2.copy()

encoded_cat = ["enc_GROUP"]
# NOTE(review): a set difference yields an arbitrary column order; verify that it matches
# the order formation_predictors_scaler was fitted with.
num_features = list(set(formation_predictors) - set(encoded_cat))


mask = filled_whole3.FORMATION.isna()
if mask.any():
    # Explicit copy: the frame is scaled in place and must not be a slice of filled_whole3.
    vali  = filled_whole3.loc[mask,num_features].copy()
    vali[vali.columns] = formation_predictors_scaler.transform(vali)
    # Same mask-based selection as in the accuracy check of the FORMATION classifier.
    vali[encoded_cat] = filled_whole3.loc[mask,encoded_cat]
    # np.ravel: CatBoost may return multiclass labels as an (n, 1) column; flatten to 1-D.
    filled_whole3.loc[mask,"FORMATION"] = np.ravel(formation_imputer.predict(vali[formation_predictors]))

## Saving 
Now we save processed data and processing pipeline.

In [None]:
 from joblib import dump, load

In [None]:
# Copy the lithology label and its confidence back from the raw frame; the assignment
# aligns rows on the index. Presumably this guarantees the targets are saved exactly as
# loaded, untouched by the processing pipeline — TODO confirm.
filled_whole3.loc[:,['FORCE_2020_LITHOFACIES_LITHOLOGY','FORCE_2020_LITHOFACIES_CONFIDENCE']] = train_df.loc[:,['FORCE_2020_LITHOFACIES_LITHOLOGY','FORCE_2020_LITHOFACIES_CONFIDENCE']]

In [None]:
# Drop the helper encoding column and downcast dtypes before saving.
# errors="ignore" keeps the cell re-runnable: on a second run enc_GROUP is already gone
# and a plain drop would raise KeyError.
filled_whole3,na = reduce_mem_usage(filled_whole3.drop(columns="enc_GROUP", errors="ignore"))

In [None]:
#### All features are expected to be filled at this point. The only exception noted by the
#### author is FORCE_2020_LITHOFACIES_CONFIDENCE, which does not matter since it is not used.
filled_whole3.info()

In [None]:
# Persist the fully imputed dataset next to the raw data, without the index column.
filled_whole3.to_csv(data_folder+"filled_train.csv",index=False)

In [None]:
# Save the processing pipeline parameters under data_folder (what exactly is written is
# decided by WellLogsProcessing.save_processing_params).
logs_processor.save_processing_params(data_folder=data_folder)

In [None]:
###### Persist the GROUP classifier together with its predictor list and scaler
group_name = "group_imputer"
group_dict = {
    "predictors": group_predictors,
    "imputer": group_imputer,
    "scaler": group_predictors_scaler,
}
dump(group_dict, data_folder + group_name + ".joblib")

###### Persist the FORMATION classifier together with its predictor list and scaler
formation_name = "formation_imputer"
formation_dict = {
    "predictors": formation_predictors,
    "imputer": formation_imputer,
    "scaler": formation_predictors_scaler,
}
dump(formation_dict, data_folder + formation_name + ".joblib")

This is the end of data processing. We performed null-value imputation, categorical-feature encoding and numeric-feature scaling.