In [1]:
import pandas as pd

In [2]:
import numpy as np

# lookup table: raw lithology code -> integer class label;
# each code's label is its position (0-11) in the tuple below
lithology_map = {
    code: label
    for label, code in enumerate((
        30000, 65030, 65000, 80000,
        74000, 70000, 70032, 88000,
        86000, 99000, 90000, 93000,
    ))
}

# encoding and decoding functions
def lithology_encode(lithology_array):
    """Translate raw lithology codes into their integer class labels.

    The lookup uses the module-level ``lithology_map``.

    Parameters
    ----------
    lithology_array : array-like
        Raw lithology codes (array, Series, list or scalar). Every value
        must be a key of ``lithology_map``.

    Returns
    -------
    numpy.ndarray
        Integer labels with the same shape as the input. An empty input
        yields an empty integer array.

    Raises
    ------
    ValueError
        If a value has no entry in ``lithology_map``.
    """
    codes = np.asarray(lithology_array)
    try:
        # A plain lookup (instead of np.vectorize(dict.get)) fails loudly on an
        # unmapped code rather than emitting None, and also works for empty
        # input, where np.vectorize cannot infer an output dtype.
        labels = [lithology_map[code] for code in codes.ravel().tolist()]
    except KeyError as error:
        raise ValueError(
            f"unknown lithology code {error.args[0]!r}; "
            f"expected one of {sorted(lithology_map)}"
        ) from error
    return np.array(labels, dtype=int).reshape(codes.shape)

def lithology_decode(lithology_array):
    """Translate integer class labels back into raw lithology codes.

    This is the inverse of the module-level ``lithology_map``; the inverse
    table is rebuilt on every call from the map's current contents.

    Parameters
    ----------
    lithology_array : array-like
        Integer class labels (array, Series, list or scalar). Every value
        must be a value of ``lithology_map``.

    Returns
    -------
    numpy.ndarray
        Raw lithology codes with the same shape as the input. An empty input
        yields an empty integer array.

    Raises
    ------
    ValueError
        If a label has no corresponding lithology code.
    """
    inverse_lithology_map = {label: code for code, label in lithology_map.items()}
    labels = np.asarray(lithology_array)
    try:
        # A plain lookup (instead of np.vectorize(dict.get)) fails loudly on an
        # unknown label rather than emitting None, and also works for empty
        # input, where np.vectorize cannot infer an output dtype.
        codes = [inverse_lithology_map[label] for label in labels.ravel().tolist()]
    except KeyError as error:
        raise ValueError(
            f"unknown lithology label {error.args[0]!r}; "
            f"expected one of {sorted(inverse_lithology_map)}"
        ) from error
    return np.array(codes, dtype=int).reshape(labels.shape)

In [3]:
from itertools import combinations

# create feature function
def create_features(df, diff_columns=('GR', 'NPHI', 'RHOB', 'DTC'),
                    well_column='WELL', depth_column='DEPTH_MD'):
    """Add per-well and pairwise difference features to ``df`` in place.

    The frame is sorted in place by ``well_column`` then ``depth_column``
    and gains two families of columns:

    * ``<col>_MEAN_DIFF`` for every ``col`` in ``diff_columns``: the value
      minus the mean of that column within the row's well.
    * ``<a>_<b>_DIFF`` for every unordered pair ``(a, b)`` of
      ``diff_columns`` (in listed order): ``df[a] - df[b]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place (row order and columns).
    diff_columns : sequence of str, optional
        Columns the features are derived from. Defaults to the four
        columns the original implementation hard-coded.
    well_column : str, optional
        Column used for grouping and as the primary sort key.
    depth_column : str, optional
        Column used as the secondary sort key.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any required column is absent. The check runs before anything is
        modified, so a failing call leaves ``df`` untouched.
    """
    diff_columns = list(diff_columns)

    # Validate first: otherwise a missing column would only surface after the
    # frame had already been re-sorted and partially extended.
    required = [well_column, depth_column] + diff_columns
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"create_features: missing required column(s): {missing}")

    df.sort_values(by=[well_column, depth_column], inplace=True)

    for column in diff_columns:
        # deviation of each sample from the mean of its own well
        well_mean = df.groupby(well_column)[column].transform('mean')
        df[column + "_MEAN_DIFF"] = df[column].sub(well_mean)

    for column1, column2 in combinations(diff_columns, 2):
        df[column1 + "_" + column2 + "_" + "DIFF"] = df[column1] - df[column2]

In [4]:
from fastai.tabular import core
import xgboost as xgb

# declare target
dependent_variable = 'FORCE_2020_LITHOFACIES_LITHOLOGY'

# read in the training data: the confidence column is dropped and the raw
# lithology codes are replaced by their integer class labels
df_training = pd.read_csv('train.csv', sep = ';').drop(['FORCE_2020_LITHOFACIES_CONFIDENCE'], axis = 1)
df_training[dependent_variable] = lithology_encode(df_training[dependent_variable])
df_training['PHASE'] = 0

# read in the test data: the target column is filled with a constant 0
# placeholder and PHASE = 1 marks the rows as test rows
df_test = pd.read_csv('test.csv', sep = ';')
df_test[dependent_variable] = 0
df_test['PHASE'] = 1

# combine training and test data so both receive identical features and preprocessing.
# drop=True is required: a plain reset_index() keeps each file's old row labels as
# an integer 'index' column, which is never removed below and would therefore be
# handed to the model as a feature (row position in the CSV).
df = pd.concat([df_training,df_test], axis=0).reset_index(drop=True)

# create features; create_features sorts the frame in place, so the original
# train-then-test row order is restored afterwards via the fresh 0..N-1 index
create_features(df)
df.sort_index(inplace=True)
df.drop('WELL', axis=1, inplace=True)

# find splits: PHASE 0 rows are used for fitting, PHASE 1 rows are the rows to predict
train_indexes = list(df[df['PHASE'] == 0].index)
validation_indexes = list(df[df['PHASE'] == 1].index)

# make split tuple, then remove the helper column so it is not used as a feature
splits = (train_indexes, validation_indexes)
df.drop('PHASE', axis=1, inplace=True)

# declare preprocessing functions
processing_functions = [core.Categorify, core.FillMissing]

# determine categorical and continuous features
# NOTE(review): with max_card = 1 presumably every integer column with more than one
# distinct value is treated as continuous - confirm against the fastai docs
continuous_vars, categorical_vars = core.cont_cat_split(df, max_card = 1, dep_var = dependent_variable)

# create tabular object
table = core.TabularPandas(df, procs = processing_functions, cat_names = categorical_vars, cont_names = continuous_vars, y_names = dependent_variable, splits = splits)

# identify unhelpful columns (the '_na' names are presumably the missing-value
# indicator columns added by FillMissing - TODO confirm); every name listed here
# must exist in the processed table, otherwise the drop below raises KeyError
misleading_columns = ['DRHO_na', 'RMIC', 'DTS_na', 'GR_RHOB_DIFF_na', 'X_LOC_na', 'PEF_na', 'ROPA', 'RHOB_MEAN_DIFF_na', 'NPHI_DTC_DIFF_na', 'CALI_na', \
                      'Z_LOC_na', 'RHOB_DTC_DIFF_na', 'GR_DTC_DIFF_na', 'RSHA_na', 'GR_NPHI_DIFF_na', 'DTC_MEAN_DIFF_na', 'NPHI_MEAN_DIFF_na', 'ROP', 'MUDWEIGHT', 'RXO', \
                      'DRHO', 'Y_LOC_na', 'RDEP_na', 'SP', 'RMIC_na', 'ROPA_na', 'RXO_na', 'MUDWEIGHT_na', 'NPHI_RHOB_DIFF_na', 'RHOB_na', 'RMED_na', 'NPHI_na', \
                      'SP_na', 'DTC_na', 'GR_NPHI_DIFF']

# separate out training and test data
X_train, y_train = table.train.xs.drop(misleading_columns, axis=1), table.train.y
X_test = table.valid.xs.drop(misleading_columns, axis=1)

# declare classifier
# NOTE(review): tree_method = 'gpu_hist' needs a CUDA-enabled xgboost build and is,
# as far as I recall, deprecated in newer xgboost releases - confirm before upgrading
clf = xgb.XGBClassifier(
        n_estimators = 108,
        learning_rate = 0.058341077396837984,
        max_depth = 10,
        min_child_weight = 3.975751644687145,
        gamma = 3.2978917680925592,
        subsample = 0.660954379979485,
        colsample_bytree = 0.35601189656690413,
        tree_method = 'gpu_hist'
    )

# fit classifier
clf.fit(X_train, y_train)

# find predictions
predictions = clf.predict(X_test)

# rename predictions: map the predicted class labels back to raw lithology codes
decoded_predictions = lithology_decode(predictions)

# create dataframe and write it out as a single 'lithology' column
pd.DataFrame(data=decoded_predictions, columns=['lithology']).to_csv('predictions.csv', index=False)