In [1]:
# Script by Bohdan Pavlyshenko, b.pavlyshenko@gmail.com, https://www.linkedin.com/in/bpavlyshenko/
#
# Some code in this script was taken from the starter Jupyter notebook of the
# competition 'FORCE: Machine Predicted Lithology' (https://xeek.ai/challenges/force-well-logs/overview)

In [2]:
import os
import pickle

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder

In [3]:
def score(y_true, y_pred):
    """Return the negated mean penalty of a set of class predictions.

    Every sample contributes ``A[true_class, predicted_class]``, where ``A`` is
    the module-level penalty matrix loaded elsewhere in this notebook. The
    contributions are averaged and the sign is flipped, so the result is
    ``-mean(A[y_true, y_pred])``.

    Parameters
    ----------
    y_true : array-like
        True class indices. Values are cast to int and flattened, so a 1-D
        array, an ``(n, 1)`` column or a pandas Series are all accepted.
    y_pred : array-like
        Predicted class indices, one per sample; cast to int and flattened.

    Returns
    -------
    float
        The negated mean penalty.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of samples.
    """
    # np.asarray drops any pandas index, so the lookup is always positional.
    true_idx = np.asarray(y_true).astype(int).ravel()
    pred_idx = np.asarray(y_pred).astype(int).ravel()
    n_samples = true_idx.size
    if n_samples == 0:
        raise ValueError('score() needs at least one sample')
    if pred_idx.size != n_samples:
        raise ValueError(
            'y_true and y_pred must have the same number of samples, got '
            f'{n_samples} and {pred_idx.size}')
    # One fancy-index lookup replaces a Python-level loop over every sample.
    return -float(A[true_idx, pred_idx].sum()) / n_samples
def feval_f(y_pred, dset):
    """Custom LightGBM evaluation function: mean penalty of the arg-max class.

    Parameters
    ----------
    y_pred : numpy array
        Raw multiclass predictions. Either a 2-D ``(n_samples, n_classes)``
        array, or a flat 1-D array grouped by class first and by row second
        (12 classes assumed for the flat layout).
    dset : object with ``get_label()``
        The dataset being evaluated; only ``get_label()`` is used.

    Returns
    -------
    tuple
        ``('score', value, False)`` where ``value`` is ``-score(...)`` and the
        final ``False`` is the "is higher better" flag.
    """
    y_true = dset.get_label()
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 2:
        # NOTE(review): LightGBM 4.x documents a (n_samples, n_classes) array
        # for multiclass custom metrics. Reshaping that to (12, -1) would mix
        # samples and classes, so take the arg-max per row instead.
        y_v = np.argmax(y_pred, axis=1)
    else:
        # Flat class-major layout: one row per class after the reshape.
        y_v = np.argmax(y_pred.reshape(12, -1), axis=0)
    score_v = -score(y_true, y_v)
    return ('score', score_v, False)

In [None]:
# Set up options
# n_splits and test_size are passed to GroupShuffleSplit in the training cell.
n_splits=10
test_size=0.15
# directory for models and model predictions
model_dir='models/'
file_dir='stacking/'
# Later cells write pickles and CSV files into both directories; create them
# up front so a fresh checkout does not fail at the first write.
for _out_dir in (model_dir, file_dir):
    os.makedirs(_out_dir, exist_ok=True)
# load penalty matrix (indexed as A[true_class, predicted_class] by score())
A = np.load('data/penalty_matrix.npy')
# load train and test sets
data_train = pd.read_csv('data/train.csv', sep=';')
data_test=pd.read_csv('data/test.csv', sep=';')

In [None]:
ntrain=data_train.shape[0]
# Train and test rows are stacked so that each categorical column is encoded
# with one shared set of labels; the first ntrain rows are split back out below.
data=pd.concat([data_train,data_test],axis=0)

# Fit one LabelEncoder per categorical column and save it in model_dir.
# NOTE(review): astype(str) presumably turns missing values into a string
# category of their own - confirm this is intended.
for _column, _enc_file in (('FORMATION', 'formation_enc.pkl'),
                           ('GROUP', 'group_enc.pkl')):
    enc=LabelEncoder()
    data[_column]=enc.fit_transform(data[_column].astype(str))
    # The context manager closes the file as soon as the pickle is written.
    with open(model_dir+_enc_file, 'wb') as _enc_fh:
        pickle.dump(enc, _enc_fh)

# Positional slicing: the concatenated frame keeps the original (duplicated)
# index labels, so iloc is the safe way to separate train from test again.
data_train=data.iloc[:ntrain,:].copy()
data_test=data.iloc[ntrain:,:].copy()
data_train['y'] = data_train['FORCE_2020_LITHOFACIES_LITHOLOGY']

# This part of the code is based on the code from the starter jupyter notebook 
# Lithology codes -> consecutive class indices 0..11 used as the model target.
lithology_numbers = {30000: 0,
                 65030: 1,
                 65000: 2,
                 80000: 3,
                 74000: 4,
                 70000: 5,
                 70032: 6,
                 88000: 7,
                 86000: 8,
                 99000: 9,
                 90000: 10,
                 93000: 11}
# Codes missing from the mapping become NaN here.
data_train['y'] = data_train['y'].map(lithology_numbers)
######################################################################

# Full feature list, including depth and well-location columns.
features_set1=['DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'GROUP', 'FORMATION',
       'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR','SGR', 'NPHI', 'PEF',
       'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'MUDWEIGHT', 'RMIC',
       'ROPA', 'RXO']

# Same as features_set1 without 'DEPTH_MD', 'X_LOC' and 'Y_LOC'.
features_set2=['Z_LOC', 'GROUP', 'FORMATION',
       'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'SGR', 'NPHI', 'PEF',
       'DTC', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'MUDWEIGHT', 'RMIC',
       'ROPA', 'RXO']

# Settings shared by every parameter set.
# 'metric' is the string 'None' and appears exactly once per dict.
# NOTE(review): the LightGBM parameter docs describe 'is_unbalance' for the
# binary and multiclassova objectives - confirm it has any effect with
# 'multiclass'.
_common_parameters = {
    'objective': 'multiclass',
    'num_class': 12,
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'metric': 'None',
    'verbose': -1,
}


def _lgb_parameters(**overrides):
    """Return a fresh parameter dict: the shared settings plus ``overrides``."""
    return {**_common_parameters, **overrides}


# One entry per base model of the stack: the feature list it sees and the
# LightGBM parameters it is trained with. Insertion order (par1..par8) is the
# order in which the models are trained and later stacked.
# NOTE(review): par1, par2, par4, par7 and par8 set 'bagging_fraction' without
# 'bagging_freq'; per the LightGBM docs bagging is only active when
# bagging_freq is non-zero - confirm this is intended.
par_list={
'par1':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=25,
    feature_fraction=0.15,
    bagging_fraction=0.05,
    learning_rate=0.1,
    max_depth=5,
    lambda_l2=3,
    num_iterations=100,
)},

'par2':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=300,
    feature_fraction=0.5,
    bagging_fraction=0.05,
    learning_rate=0.01,
    max_depth=7,
    bagging_seed=15,
    seed=15,
    num_iterations=450,
)},

'par3':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=25,
    feature_fraction=0.5,
    bagging_fraction=0.15,
    learning_rate=0.01,
    max_depth=7,
    lambda_l2=10,
    bagging_seed=15,
    seed=15,
    min_data_in_leaf=150,
    bagging_freq=30,
    num_iterations=650,
)},

'par4':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=500,
    feature_fraction=0.5,
    bagging_fraction=0.01,
    learning_rate=0.015,
    max_depth=7,
    bagging_seed=15,
    seed=15,
    num_iterations=300,
)},

'par5':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=250,
    feature_fraction=0.5,
    bagging_fraction=0.05,
    learning_rate=0.015,
    max_depth=7,
    lambda_l2=10,
    bagging_seed=15,
    seed=15,
    min_data_in_leaf=150,
    bagging_freq=30,
    num_iterations=250,
)},

'par6':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=500,
    feature_fraction=0.5,
    bagging_fraction=0.15,
    learning_rate=0.01,
    max_depth=5,
    bagging_seed=15,
    seed=15,
    min_data_in_leaf=150,
    bagging_freq=30,
    num_iterations=550,
)},

'par7':{
'features':features_set1,
'parameters':_lgb_parameters(
    num_leaves=100,
    feature_fraction=0.25,
    bagging_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,
    lambda_l2=1,
    bagging_seed=15,
    seed=15,
    num_iterations=550,
)},

'par8':{
'features':features_set2,
'parameters':_lgb_parameters(
    num_leaves=100,
    feature_fraction=0.35,
    bagging_fraction=0.1,
    learning_rate=0.01,
    max_depth=7,
    bagging_seed=15,
    seed=15,
    num_iterations=500,
)},
}

# The splitter is seeded and the data do not change between parameter sets,
# so the well-grouped folds are drawn once and the very same index arrays are
# used both for lgb.cv and for collecting the hold-out predictions.
# NOTE(review): GroupShuffleSplit draws each split independently, so hold-out
# sets of different splits can share rows - confirm this is acceptable for the
# stacking features.
gfld =GroupShuffleSplit(n_splits=n_splits,test_size=test_size, random_state=15)
cv_folds=list(gfld.split(data_train, data_train.y, data_train.WELL))

# Hold-out labels in fold order; identical for every parameter set, so they
# are written once. Rows line up with the per-model prediction files below.
ycv=np.vstack([data_train.iloc[test_indx][['y']].values for _, test_indx in cv_folds])
pd.DataFrame(ycv,columns=['y']).to_csv(file_dir+'y.csv',index=False)

for i_par in par_list.keys():
    print('\nparameters set',i_par )
    features=par_list[i_par]['features']
    parameters=par_list[i_par]['parameters'].copy()
    print('cross-validation')
    train_d = lgb.Dataset(data_train[features], label=data_train.y,
                                  categorical_feature=['FORMATION','GROUP'],free_raw_data=False)
    model=lgb.cv(parameters, train_d, folds=cv_folds, return_cvbooster=True)
    boosters=model['cvbooster'].boosters
    # Each fold's booster predicts only the rows it was not trained on.
    pred_list=[booster.predict(data_train.iloc[test_indx][features])
               for booster, (_, test_indx) in zip(boosters, cv_folds)]
    predcv=np.vstack(pred_list)
    pd.DataFrame(predcv.round(5)).to_csv(file_dir+i_par+'.csv', index=False)
    cv_pred_res=np.argmax(predcv,axis=1)
    # ycv is an (n, 1) column; flatten it so score() gets one label per sample.
    score_cv=score(ycv.ravel(),cv_pred_res)
    print ('score',score_cv)
    print ('test prediction')
    # Refit on all training rows, then predict the test set for the stack.
    train_dt=lgb.Dataset(data_train[features], label=data_train.y,categorical_feature=['FORMATION','GROUP'])
    model = lgb.train(parameters,train_dt)
    # The context manager closes the file as soon as the pickle is written.
    with open(model_dir+'lgb_model_'+str(i_par)+'.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
    test_pred_p = model.predict(data_test[features])
    pd.DataFrame(test_pred_p.round(5)).to_csv(file_dir+i_par+'_test.csv', index=False)

In [None]:
# Stack the base models: their saved class probabilities become the features
# of twelve one-vs-rest logistic regressions, one per lithology class.
stack_list=list(par_list.keys())
# NOTE(review): stack_list1 and n_stack_list are not used in the visible code.
stack_list1=['par1', 'par2', 'par3', 'par4', 'par5', 'par6', 'par7', 'par8']
n_stack_list=len(stack_list)
y_df=pd.read_csv(file_dir+'y.csv')
# Hold-out and test-set predictions of every base model, in stack_list order.
stack_train_df=[pd.read_csv(file_dir+i_par+'.csv') for i_par in stack_list]
stack_test_df=[pd.read_csv(file_dir+i_par+'_test.csv') for i_par in stack_list]

# Column-wise concatenation: each base model contributes its own columns.
X=np.hstack(stack_train_df)
Xtest=np.hstack(stack_test_df)
ntrn=X.shape[0]
ntest=Xtest.shape[0]

test_res_p=np.zeros([ntest,12])
trn_res_p=np.zeros([ntrn,12])
for i in np.arange(12):
    print ('class',i)
    # Binary target: 1.0 where the sample belongs to class i, 0.0 elsewhere.
    y=(y_df.y.values==i).astype(float)
    lr=LogisticRegression(n_jobs=12)
    lr.fit(X,y)
    # The context manager closes the file as soon as the pickle is written.
    with open(model_dir+'lr_model_'+str(i)+'.pkl', 'wb') as lr_file:
        pickle.dump(lr, lr_file)
    # Column 1 of predict_proba is the probability of the positive class.
    test_res_p[:,i]=lr.predict_proba(Xtest)[:,1]
    trn_res_p[:,i]=lr.predict_proba(X)[:,1]

trn_pred=np.argmax(trn_res_p,axis=1)
# NOTE(review): this is computed on the same rows the logistic regressions
# were fitted on, so for the stacking layer it is an in-sample score.
cv_score=score(y_df.y.values,trn_pred)
print(f'cv_score:{cv_score}')

test_pred=np.argmax(test_res_p,axis=1)

# This part of the code is from the starter jupyter notebook 
# Map class indices back to the original lithology codes for the submission.
category_to_lithology = {y:x for x,y in lithology_numbers.items()}
test_prediction_for_submission = np.vectorize(category_to_lithology.get)(test_pred)
np.savetxt('prediction.csv', test_prediction_for_submission, header='lithology', comments='', fmt='%i')
################################################################################
print("Done!")