In [1]:
import math
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import pickle
import ipywidgets as widgets
# from utils import *
pd.set_option('display.max_columns', 30)

In [2]:
# map of lithology codes to description
lithology_keys = {30000: 'Sandstone',
                 65030: 'Sandstone/Shale',
                 65000: 'Shale',
                 80000: 'Marl',
                 74000: 'Dolomite',
                 70000: 'Limestone',
                 70032: 'Chalk',
                 88000: 'Halite',
                 86000: 'Anhydrite',
                 99000: 'Tuff',
                 90000: 'Coal',
                 93000: 'Basement'}
    
# map of lithology codes to integer label for ML
lithology_numbers = {30000: 0,
                     65030: 1,
                     65000: 2,
                     80000: 3,
                     74000: 4,
                     70000: 5,
                     70032: 6,
                     88000: 7,
                     86000: 8,
                     99000: 9,
                     90000: 10,
                     93000: 11}

# Submission

In [3]:
class Model(object):
    def __init__(self, models_file,
                 modelF_file,
                 modelDTC_file,
                 dictF_file, 
                 median_trn_file,
                 feats_name_file):
        # Load pre-trained model from file
        self.models = pickle.load(open(models_file, 'rb'))
        # Load dict
        self.dictF = pickle.load(open(dictF_file, 'rb'))
        #load median values statistics
        self.median_trn = pickle.load(open(median_trn_file, 'rb'))
        # load models for filling missing data
        self.modelF = pickle.load(open(modelF_file, 'rb'))
        self.modelDTC = pickle.load(open(modelDTC_file, 'rb'))
        self.feats_name =  pickle.load(open(feats_name_file, 'rb'))
        
    def _preprocess(self, features):
        # Method to be run before inference. Contains things like
        # stripping unwanted columns, replacing NaNs
        features = features[['WELL','DEPTH_MD','X_LOC','Y_LOC','GROUP',
                            'FORMATION','CALI','RSHA','RMED','RDEP','RHOB','GR',
                            'NPHI', 'PEF', 'DTC', 'SP', 'BS', 'DRHO']]
        features = self.fillGROUP(features)
        features = self.fillRDEP(features)
        features = self.fillXYLOC(features)
        features = self.fillFORMATION(features)
        features = self.fillDTC(features)
        # filling remaining missing data using train statictis
        features.fillna(self.median_trn, inplace=True)
        features.drop(columns=['WELL','DTC_media', 'GROUP','GcatCodes', 'FcatCodes'], inplace=True)        
        features = pd.get_dummies(features, columns=['FORMATION'], drop_first=True)
        features = self.fillFeatures(features)
        return features
        
    def predict(self, features):
        # This function should be able to take in features in their
        # raw, unprocessed form as read from the file test.csv and
        # return predictions as an array integers of the same length
        features = self._preprocess(features)
        preds = np.zeros((len(features), 12))
        for clf in self.models:
            preds += clf.predict_proba(features)
        return np.argmax(preds, axis=-1)
    
    def fillFeatures(self, features):
        # create a container with same columns of training data.
        test_features = pd.DataFrame(columns=self.feats_name, data=np.zeros((len(features), len(self.feats_name))))
        features = test_features + features
        features.fillna(0, inplace=True)
        features = features[self.feats_name]
#         print(features.keys())
        return features
    
    def fillGROUP(self, features):
        df_ = features.copy()
        for well in df_.WELL[df_.GROUP.isna()].unique():
            df_.GROUP.loc[df_.WELL==well] = df_.GROUP.loc[df_.WELL==well].fillna(method='bfill')
        return df_
    
    def fillRDEP(self, features):
        df_ = features.copy()
        for well in df_.WELL[df_.RDEP.isna()].unique():
            data = df_.RDEP.loc[df_.WELL==well]
            df_.RDEP.loc[df_.WELL==well] = data.fillna(data.median())
        return df_
    
    def fillXYLOC(self, features):
        df_ = features.copy()
        for well in df_.WELL.unique():
            X_values = df_.X_LOC.loc[df_.WELL==well]
            Y_values = df_.Y_LOC.loc[df_.WELL==well]
            if X_values.isna().sum() > 0:
                X_values = X_values.fillna(method='bfill')
                df_.X_LOC.loc[df_.WELL==well] = X_values.fillna(method='ffill')
            if Y_values.isna().sum() > 0:
                Y_values = Y_values.fillna(method='bfill')
                df_.Y_LOC.loc[df_.WELL==well] = Y_values.fillna(method='ffill')
        return df_
    
    def fillFORMATION(self, features):
        df_ = features.copy()
        dfF = df_[['WELL','DEPTH_MD','GROUP', 'GR', 'RDEP', 'FORMATION']]
        dfF.GROUP = dfF.GROUP.astype('category')
        dfF['GcatCodes'] = dfF.GROUP.cat.codes

        wells = dfF.WELL.unique()
        window = 100

        df_temp = []
        for well in wells:
            DatabyWell = dfF.loc[dfF.WELL==well].sort_values(by='DEPTH_MD')
            DatabyWell['GR_median']= DatabyWell['GR'].rolling(window).median()
            DatabyWell['RDEP_median']= DatabyWell['RDEP'].rolling(window).median()
            DatabyWell['GR_std'] = DatabyWell['GR'].rolling(window).std()    
            DatabyWell['RDEP_std'] = DatabyWell['RDEP'].rolling(window).std()

            DatabyWell.GR_median = DatabyWell.GR_median.fillna(method='bfill')
            DatabyWell.RDEP_median = DatabyWell.RDEP_median.fillna(method='bfill')

            DatabyWell.GR_std = DatabyWell.GR_std.fillna(method='bfill')    
            DatabyWell.RDEP_std = DatabyWell.RDEP_std.fillna(method='bfill')

            df_temp.append(DatabyWell)

        dfF = pd.concat(df_temp)
        dfF = dfF.loc[dfF.FORMATION.isna()]
        X_Data = dfF[['DEPTH_MD','GR','GR_median','GR_std','RDEP','RDEP_median','RDEP_std','GcatCodes']]

        pred_F = self.modelF.predict(X_Data)

        dfF.FORMATION = dfF.FORMATION.astype('str')
        dfF.FORMATION.loc[dfF.index] = [self.dictF[pred_F[i]] for i in range(len(pred_F))]
        df_.FORMATION.loc[dfF.index] = dfF.FORMATION
        return df_
    
    def fillDTC(self, features, window_dtc=50):
        df_ = features.copy()
        df_.GROUP = df_.GROUP.astype('category')
        df_.FORMATION = df_.FORMATION.astype('category')
        df_['GcatCodes'] = df_.GROUP.cat.codes
        df_['FcatCodes'] = df_.FORMATION.cat.codes
        df_['DTC_media'] = 0
        for well in df_.WELL.unique():
            df_.DTC_media.loc[df_.WELL==well] = self.modelDTC.predict(df_[df_.WELL==well][['DEPTH_MD','GR','GcatCodes','FcatCodes']])
            df_.DTC_media.loc[df_.WELL==well] = df_.DTC_media.loc[df_.WELL==well].rolling(window_dtc).median()
            df_.DTC_media.loc[df_.WELL==well] = df_.DTC_media.loc[df_.WELL==well].fillna(method='bfill')
            df_.DTC.loc[df_.DTC.isna()] = df_.DTC_media.loc[df_.DTC.isna()]        
        return df_

In [5]:
model = Model(models_file='models.pkl', 
              modelF_file='model_F.pkl',
              modelDTC_file='model_DTC.pkl',
              dictF_file='dictF.pkl' ,
              median_trn_file='train_median.pkl',
              feats_name_file='feat_Columns.pkl')

In [7]:
open_test_features = pd.read_csv('test.csv', sep=';')

In [8]:
test_prediction = model.predict(open_test_features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying t

In [9]:
category_to_lithology = {y:x for x,y in lithology_numbers.items()}

In [10]:
test_prediction_for_submission = np.vectorize(category_to_lithology.get)(test_prediction)

In [13]:
# target = pd.read_csv('/scratch/parceirosbr/bigoilict/share/GeoFacies/Maykol/Force/notebooks/sub41.csv')
# target = pd.read_csv('/scratch/parceirosbr/bigoilict/share/GeoFacies/Force/best_8features.csv')
# best_8features_cor.csv

In [14]:
(target.values.ravel()==test_prediction_for_submission).sum()/len(target)

0.9502726887254543

In [16]:
np.savetxt('test_predictions_jb.csv', test_prediction_for_submission, header='lithology', comments='', fmt='%i')

In [15]:
!pip freeze > requirements.txt