In [None]:
import numpy as np
import pandas as pd
from functools import reduce
import joblib
from category_encoders.target_encoder import TargetEncoder
import lightgbm as lgb
from pathlib import Path

In [None]:
class Model:
    """Inference pipeline turning a raw well-log table into lithology codes.

    ``predict`` is the entry point. It puts every well on a regular depth
    grid, derives lag / rolling / diff / gradient features, averages the class
    probabilities of the model ensembles stored on disk and maps the winning
    class of every input row to a lithology code.
    """

    # Expected spacing between consecutive samples of one well, expressed in
    # the integer depth unit used internally (DEPTH_MD * 1000, rounded).
    DEPTH_STEP = 152
    # Directories (relative to the working directory) holding one ensemble each.
    MODEL_NAMES = ('lgbm_model_1', 'lgbm_model_2')

    def __init__(self):
        # Column labels given to the class-probability matrix of every model.
        self.targets = [0, 1, 2, 3, 4, 5, 6, 7, 8]

    @staticmethod
    def _check_depth_step_(df, step):
        """Print how many samples are not exactly ``step`` below their predecessor.

        Args:
            df: frame with 'Well' and 'Depth' columns, sorted by depth per well.
            step: expected depth increment between consecutive samples.
        """
        previous_depth = df.groupby('Well')['Depth'].shift()
        # The first sample of a well has no predecessor; count it as regular.
        residual = (df['Depth'] - previous_depth).fillna(step) - step
        msk = residual != 0
        print('## Number of incorrect depth samples: ', msk.sum())
        outlier_wells = df.loc[msk, 'Well'].unique()
        print('## Wells with incorrect depth step: ', outlier_wells)

    def _preprocess_raw_(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean the raw table and put every well on a regular depth grid.

        Args:
            df: raw table with at least 'WELL', 'DEPTH_MD', 'GR', 'GROUP' and
                'FORMATION' columns. It is not modified.

        Returns:
            A frame indexed by ('Well', 'Depth') that contains every input row
            plus one row per missing grid depth. 'GR' is interpolated per
            well, 'GROUP' / 'FORMATION' gaps become 'Unknown' and a 'Sector'
            column holds the integer prefix of the well name.

        Raises:
            ValueError: if a (Well, Depth) pair occurs more than once.
        """
        print('# Drop & rename')
        # errors='ignore': SGR is dropped when present but is not required.
        df = (
            df.drop(columns=['SGR'], errors='ignore')
              .rename(columns={'WELL': 'Well', 'DEPTH_MD': 'Depth'})
        )
        df['Depth'] = (1000 * df['Depth']).round().astype(int)
        df = df.sort_values(by=['Well', 'Depth'])

        print('# Check dublicates:')
        if df.duplicated(subset=['Well', 'Depth'], keep=False).any():
            raise ValueError('Data containes dublicates')
        print('Not Found')

        step = self.DEPTH_STEP
        self._check_depth_step_(df, step)

        print('# Apply Rebin')

        def rebin(start, end):
            # Evenly spaced integer depths from the first to the last sample.
            num = int(round((end - start) / step + 1))
            # Builtin int: the np.int alias no longer exists in current NumPy.
            return np.linspace(start, end, num, dtype=int)

        bounds = df.groupby('Well')['Depth'].agg(['min', 'max'])
        df_base = pd.concat(
            [pd.DataFrame({'Well': well, 'Depth': rebin(lo, hi)})
             for well, lo, hi in bounds.itertuples()],
            ignore_index=True,
        )
        df_m = df_base.merge(df, how='outer', on=['Well', 'Depth'])
        # Interpolation below runs along row order, so pin it to depth order
        # instead of relying on the merge to sort its keys.
        df_m = df_m.sort_values(by=['Well', 'Depth']).reset_index(drop=True)

        print('# Fill missing values')
        # transform() keeps the original row index on every pandas version,
        # unlike groupby.apply(), which may prepend the group key.
        df_m['GR'] = df_m.groupby('Well')['GR'].transform(
            lambda s: s.interpolate(method='linear', limit_direction='both')
        )

        print('# Fill nan values')
        df_m['Sector'] = df_m['Well'].str.split('/').str[0].astype(int)
        df_m['GROUP'] = df_m['GROUP'].fillna('Unknown').astype(str)
        df_m['FORMATION'] = df_m['FORMATION'].fillna('Unknown').astype(str)

        print('# Set Index')
        return df_m.set_index(keys=['Well', 'Depth'], verify_integrity=True)

    @staticmethod
    def _make_features_(df: pd.DataFrame) -> pd.DataFrame:
        """Derive per-well lag, rolling, diff and gradient features.

        Args:
            df: frame indexed by ('Well', 'Depth') that contains the columns
                listed in ``preprocess_features``.

        Returns:
            A frame with the same index as ``df`` holding only the derived
            columns; every statistic is computed within a well.
        """
        preprocess_features = ['GR',  'X_LOC', 'Y_LOC', 'Z_LOC',
                    'CALI', 'NPHI', 'RMED', 'RDEP', 'SP',
                    'DTC', 'RHOB', 'BS', 'DTS', 'MUDWEIGHT', 'DCAL']

        # These grids end up in the feature names, so they are computed
        # exactly as before.
        lag_space = np.ceil(np.logspace(0, 10, num=10, base=2, endpoint=False)).astype(int)
        window_space = np.ceil(np.logspace(2, 5, num=8, base=3, endpoint=True)).astype(int)
        min_max_space = np.ceil(np.logspace(2, 5, num=8, base=3, endpoint=True)).astype(int)
        std_space = np.ceil(np.logspace(2, 5, num=8, base=3, endpoint=True)).astype(int)
        diffs_space = np.ceil(np.logspace(0, 10, num=10, base=2, endpoint=False)).astype(int)
        gradient_space = np.linspace(1, 6, num=6).astype(int)
        step = Model.DEPTH_STEP

        def _suffixed(frame, keys, suffix):
            # Append `suffix` to the name of every feature column.
            return frame.rename(columns={key: key + suffix for key in keys})

        def _half_window(window):
            # Shift applied before a rolling statistic of the given window.
            return int(round((window - 1)/2))

        def _create_lag_tao_i(data, keys, i):
            # Built-in groupby.shift keeps the original index on every
            # pandas version.
            lagged = data.groupby('Well')[keys].shift(int(i))
            return _suffixed(lagged, keys, f'_Lag_Tao+{i}')

        def _create_rolling(data, keys, window, tao, stat, label):
            # NOTE(review): a positive shift moves the trailing window further
            # back instead of centring it; left as is because the feature
            # values must match what the stored models expect -- confirm.
            def _shifted_stat(group):
                rolled = group.shift(tao).rolling(window=int(window), min_periods=2)
                return getattr(rolled, stat)()

            # group_keys=False: keep the original index rather than
            # prepending the well name as an extra level.
            result = data.groupby('Well', group_keys=False)[keys].apply(_shifted_stat)
            return _suffixed(result, keys, f'_Rolling_{label}_Shift={tao}_Window={window}')

        def _create_rolling_diff(data, keys, periods):
            diffed = data.groupby('Well')[keys].diff(periods=int(periods))
            return _suffixed(diffed, keys, f'_Rolling_Diff_Period={periods}')

        def _create_gradient(data, keys, period, step):
            depth = data.index.to_frame()['Depth']
            # Rows without a partner get `step` as spacing; the value
            # difference is NaN there, so the quotient stays NaN.
            grid = depth.groupby(level='Well').diff(int(period)).fillna(step)
            value_diff = data.groupby('Well')[keys].diff(periods=int(period))
            return _suffixed(value_diff.div(grid, axis=0), keys, f'_Gradient_Period={period}')

        data_frames = []

        print('# Lags forward')
        for i in lag_space:
            data_frames.append(_create_lag_tao_i(df, preprocess_features, i))

        print('# Lags inverse backward')
        for i in lag_space:
            data_frames.append(_create_lag_tao_i(df, preprocess_features, -i))

        print('# Rolling mean centered')
        for i in window_space:
            data_frames.append(_create_rolling(df, preprocess_features, i, _half_window(i), 'mean', 'Mean'))

        print('# Rolling max centered')
        for i in min_max_space:
            data_frames.append(_create_rolling(df, preprocess_features, i, _half_window(i), 'max', 'Max'))

        print('# Rolling min centered')
        for i in min_max_space:
            data_frames.append(_create_rolling(df, preprocess_features, i, _half_window(i), 'min', 'Min'))

        print('# Rolling std centered') 
        for i in std_space:
            data_frames.append(_create_rolling(df, preprocess_features, i, _half_window(i), 'std', 'STD'))

        print('# Diffs forward') 
        for i in diffs_space:
            data_frames.append(_create_rolling_diff(df, preprocess_features, i))

        print('# Diffs backward') 
        for i in diffs_space:
            data_frames.append(_create_rolling_diff(df, preprocess_features, -i))

        print('# Gradients forward')
        for i in gradient_space:
            data_frames.append(_create_gradient(df, preprocess_features, i, step=step))

        print('# Gradients backward')
        for i in gradient_space:
            data_frames.append(_create_gradient(df, preprocess_features, -i, step=step))

        print('# Merging')
        return pd.concat(data_frames, axis=1)

    def _create_feature_space_(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the selected base columns of ``df`` joined with derived features.

        Args:
            df: preprocessed frame indexed by ('Well', 'Depth'); it must
                contain every column named in ``all_features``.
        """
        all_features = ['X_LOC', 'Y_LOC', 'Z_LOC', 'GROUP', 'FORMATION', 'Sector',
                        'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 
                        'DTC', 'RXO', 'SP', 'BS', 'ROP', 'DTS', 'DCAL', 'DRHO', 'MUDWEIGHT',
                        'RMIC', 'ROPA']
        df = df[all_features].copy()
        ftrs = self._make_features_(df)
        df_fs = pd.concat([df, ftrs], axis=1)
        print('# Done, created {} features'.format(df_fs.shape))
        return df_fs

    @staticmethod
    def _load_models_(model_name: str) -> (list, list):
        """Load the paired categorizers and boosting models of one ensemble.

        Files are read from ``<model_name>/categorizer/*.trfrm`` and
        ``<model_name>/boost/*.pkl`` and paired by sorted file name.

        Returns:
            ``(categorizer_models, boost_models)``, two lists of equal length.

        Raises:
            FileNotFoundError: if no boosting model file is found.
        """
        print('Load Models')
        model_path = Path(model_name)
        boost_models_pathes = sorted(model_path.glob('boost/*.pkl'))
        categorizer_pathes = sorted(model_path.glob('categorizer/*.trfrm'))
        if not boost_models_pathes:
            raise FileNotFoundError(f'No boost models found under {model_path}')

        # SECURITY: joblib.load unpickles arbitrary objects; only point this
        # at model directories you trust.
        boost_models = [joblib.load(boost_m) for boost_m in boost_models_pathes]
        categorizer_models = [joblib.load(cat_m) for cat_m in categorizer_pathes]

        assert len(boost_models) == len(categorizer_models), \
            'every boost model needs a matching categorizer'

        return categorizer_models, boost_models

    def _apply_model_(self, X, categorizer, model):
        """Return the class probabilities of one model as a frame indexed like ``X``.

        ``categorizer.transform`` is applied to ``X`` before
        ``model.predict_proba``; columns are labelled with ``self.targets``.
        """
        encoded = categorizer.transform(X)
        probabilities = model.predict_proba(encoded)
        return pd.DataFrame(probabilities, index=X.index, columns=self.targets)

    def _evaluate_model_(self, X, categorizers, models):
        """Apply every (categorizer, model) pair to ``X``; return the list of results."""
        print('Appling Model')
        return [
            self._apply_model_(X, categorizer, model)
            for categorizer, model in zip(categorizers, models)
        ]

    @staticmethod
    def _compine_prediction_(predictions: list) -> pd.DataFrame:
        """Average a list of probability frames column label by column label."""
        stacked = pd.concat(predictions, axis=1)
        # Group equally labelled columns through the transpose, because
        # groupby(axis=1) is deprecated in recent pandas.
        return stacked.T.groupby(level=0).mean().T

    def _preprocess(self, features: pd.DataFrame) -> pd.DataFrame:
        """Run raw preprocessing followed by feature generation."""
        preimported_data = self._preprocess_raw_(features)
        return self._create_feature_space_(preimported_data)

    def _predict_ensemble_(self, X: pd.DataFrame, model_name: str) -> pd.DataFrame:
        """Return the averaged class probabilities of the ensemble in ``model_name``."""
        categorizers, models = self._load_models_(model_name)
        # SECURITY: allow_pickle=True executes pickled content; features.npy
        # must come from a trusted source.
        used_features = np.load(f'{model_name}/features.npy', allow_pickle=True)
        preds = self._evaluate_model_(X[used_features], categorizers, models)
        return self._compine_prediction_(preds)

    def predict(self, features: pd.DataFrame) -> np.ndarray:
        """Predict one lithology code per row of the raw ``features`` table.

        Args:
            features: raw, unprocessed table (e.g. as read from test.csv).
                It is not modified.

        Returns:
            Array with one entry per input row, in input order.
        """
        X = self._preprocess(features)
        probabilities = [
            self._predict_ensemble_(X, model_name) for model_name in self.MODEL_NAMES
        ]
        # Plain mean over the ensembles.
        P = reduce(lambda left, right: left + right, probabilities) / len(probabilities)
        return self._postprocess_(P, features.copy())

    @staticmethod
    def _postprocess_(y_pred: pd.DataFrame, y_true: pd.DataFrame) -> np.ndarray:
        """Turn class probabilities into lithology codes aligned with ``y_true``.

        Args:
            y_pred: probabilities indexed by ('Well', 'Depth'), one column per
                class label.
            y_true: raw table with 'WELL' and 'DEPTH_MD' columns; it defines
                the rows and order of the result and is not modified.

        Returns:
            Array of lithology codes, one per row of ``y_true``.
        """
        # Columns 4 and 8 are relabelled to 10 and 9 before the code lookup.
        inv_map = {4: 10, 8: 9}
        y_pred = y_pred.rename(columns=inv_map)

        category_to_lithology = {0: 30000,
                                 1: 65030,
                                 2: 65000,
                                 3: 80000,
                                 4: 74000,
                                 5: 70000,
                                 6: 70032,
                                 7: 88000,
                                 8: 86000,
                                 9: 99000,
                                 10: 90000,
                                 11: 93000}

        y_pred = y_pred.idxmax(axis=1).map(category_to_lithology)

        # Rebuild the (Well, Depth) keys the same way preprocessing does, on a
        # renamed copy so the caller's frame is left untouched.
        y_true = y_true.rename(columns={'WELL': 'Well', 'DEPTH_MD': 'Depth'})
        y_true['Depth'] = (1000 * y_true['Depth']).round().astype(int)
        y_true = y_true.set_index(['Well', 'Depth'])

        # Keeps only the original rows; grid rows added by rebinning drop out.
        y_pred = y_pred.reindex(y_true.index)

        return y_pred.to_numpy()

In [None]:
# test_raw = pd.read_csv('test.csv', sep=';')
# data = Model().predict(test_raw)
# pd.DataFrame(data=data).value_counts()
