In [1]:
import numpy as np
import pandas as pd
import pickle
import math

In [6]:
class Model(object):
    def __init__(self, df):
        self.df = df
        # Load columns from train set
        self.x_columns = pickle.load( open(r'columns.pkl', "rb"))
        # Load median of features from train set
        self.dfmedian = pickle.load( open(r'df_median.pkl', "rb"))
        
        # make feauture engineering
        X = self.feature_engineering(df)
        
        # make pre-processing
        self.X = self._preprocess(X)
        
        
    def feature_engineering(self, df):

        # Calculate gamma ray index (IGR)
        list_igr = ''
        for well in df['WELL'].unique():
            df_aux = df[df['WELL'] == well]
            GR_max = df_aux['GR'].max()
            GR_min = df_aux['GR'].min()
            df_aux['IGR'] = df_aux['GR'].apply(lambda x: (x-GR_min)/(GR_max-GR_min))
            if list_igr == '':
                list_igr = df_aux['IGR'].values
            else:
                list_igr = np.concatenate([list_igr,df_aux['IGR'].values])

        df['IGR'] = list_igr    


        # Calculate the medium porosity (PHIA)
        data = df.copy()
        data = data[['RHOB', 'NPHI']]

        def phia(rhob,nphi):
            if not(str(rhob).isalpha()) and not(str(nphi).isalpha()):
                return math.sqrt(((rhob*rhob)+(nphi*nphi))/2)
            else:
                return np.nan

        data['PHIA'] = data.apply(lambda row: phia(row[0],row[1]),axis=1)
        df['PHIA'] = data['PHIA']


        # Calculate total organic carbon (TOC)
        data = df.copy()
        data = data[['RHOB']]

        data['TOC'] = data.apply(lambda x: ((154.497/x) - 57.261))
        df['TOC'] = data['TOC']


        #Calculate apparent matrix grain density (RHOMMA)
        data = df.copy()
        data = data[['RHOB', 'PHIA']]

        def rhomma(rhob,phia):
            if not(str(rhob).isalpha()) and not(str(phia).isalpha()):
                return ((rhob*phia)/(1-phia))
            else:
                return np.nan

        data['RHOMMA'] = data.apply(lambda row: rhomma(row[0],row[1]),axis=1)
        df['RHOMMA'] = data['RHOMMA']


        # Ratio GR/RHOB
        data = df.copy()
        data = data[['GR', 'RHOB']]

        def ratiogr(gr,rhob):
            if not(str(gr).isalpha()) and not(str(rhob).isalpha()):
                return (gr/rhob)
            else:
                return np.nan

        data['GR/RHOB'] = data.apply(lambda row: ratiogr(row[0],row[1]),axis=1)
        df['GR_RHOB'] = data['GR/RHOB']


        # Ratio PE/RHOB
        data = df.copy()
        data = data[['PEF', 'RHOB']]

        def ratiopef(pef,rhob):
            if not(str(pef).isalpha()) and not(str(rhob).isalpha()):
                return (pef/rhob)
            else:
                return np.nan
        data['PEF/RHOB'] = data.apply(lambda row: ratiogr(row[0],row[1]),axis=1)
        df['PEF_RHOB'] = data['PEF/RHOB']

        return df
        
        
    def _preprocess(self, df):
        
        # select features
        df = df[['DEPTH_MD', 'FORMATION', 'CALI', 'RSHA', 'RMED', 'RDEP', 'RHOB', 'GR', 'NPHI', 'PEF', 'DTC', 'SP', 'BS',
                 'DRHO','X_LOC','Y_LOC',
                 'IGR','PHIA','TOC','GR_RHOB','PEF_RHOB','RHOMMA' ]]
        
        # handle missing values
        df.FORMATION = df.FORMATION.fillna('Utsira Fm.')
        df.BS = df.BS.fillna(12.250001).values
        df.fillna(self.dfmedian, inplace=True)
        
        # one-hot encoding for categorical features
        df = pd.get_dummies(df, columns=['FORMATION'], drop_first=True)
        
        # sort features
        df_aux = pd.DataFrame(columns=self.x_columns, data=np.zeros((df.shape[0], len(self.x_columns))))
        for col in df_aux.columns:
            if col in df.columns:
                df_aux[col] = df[col].values

        return df_aux
        
    def predict(self, postproc=True):
        
        yt = np.zeros((self.X.shape[0], 12))

        for ii in range(5): # change
            clf = pickle.load(open('model' + str(ii) + '.pkl', 'rb'))
            yt += clf.predict_proba(self.X)
            print("Model: ", ii)
            
        lithology_numbers = {30000: 0, 65030: 1, 65000: 2, 80000: 3, 74000: 4, 70000: 5, 70032: 6, 88000: 7, 86000: 8,
                 99000: 9, 90000: 10, 93000: 11}
            
        test_prediction = np.argmax(yt, axis=-1)
        category_to_lithology = {y:x for x,y in lithology_numbers.items()}
        test_prediction_for_submission = np.vectorize(category_to_lithology.get)(test_prediction)
        
        if postproc:
            test_prediction_for_submission = self._postprocess(test_prediction_for_submission)
        
        return test_prediction_for_submission
    
    def _postprocess(self, test_prediction, wdw=20):
        
        print("post-processing...")
        
        def Update(values):
            old = values[0]
            for i in range(len(values)):
                if i == len(values)-1:
                    if (values[i] != old):
                        values[i] = old
                    break
                if (old == values[i+1]) and (values[i] != old):
                    values[i] = old
                old = values[i]
            return values

        df_test = self.df
        df_test['Facies'] = test_prediction

        for we in df_test['WELL'].unique():
            a = df_test[df_test['WELL']==we]['Facies'].value_counts()
            max_value =a.index[0].astype('int64')
            a = a[a < wdw].index
            for a_ in a :
                v = df_test[(df_test['WELL']==we)]['Facies'].values
                v[v==a_] = max_value
                df_test.loc[df_test['WELL']==we,'Facies'] = v

            values = df_test[df_test['WELL']==we]['Facies'].values    
            values = Update(values)
            df_test.loc[df_test['WELL']==we,'Facies'] = values
            
        return df_test['Facies'].values

In [7]:
open_test_features = pd.read_csv('test.csv', sep=';')

In [8]:
model2 = Model(open_test_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
y_pr = model2.predict(postproc=False)
y_pr

Model:  0
Model:  1
Model:  2
Model:  3
Model:  4


array([65000, 65000, 65000, ..., 30000, 30000, 30000])

In [10]:
y_pr_pp20 = model2._postprocess(y_pr, 20)

post-processing...


In [None]:
np.savetxt('sub.csv', y_pr_pp20, header='lithology', comments='', fmt='%i')