In [1]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from scipy.signal import medfilt
import pickle as pk

# Auxiliary Functions

In [2]:
def Normalize_Data_by_Well(dataFrame,col='GR'):
    """Min-max scale ``col`` independently inside each well.

    Every row's value is mapped to ``(x - min) / (max - min)`` where ``min``
    and ``max`` are taken over the rows that share the same ``WELL`` value.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``WELL`` column and the column named by ``col``.
    col : str, default 'GR'
        Name of the column to scale.

    Returns
    -------
    list
        One scaled value per row, in the row order of ``dataFrame``. The
        result is aligned with the rows even when the rows of a well are not
        stored contiguously, so it can be assigned straight back as a column.

    Notes
    -----
    A well whose values are all equal has ``max == min`` and produces NaN for
    its rows (0/0); missing inputs stay NaN.
    """
    per_well = dataFrame.groupby('WELL', sort=False)[col]
    # transform() broadcasts each well's statistic back onto that well's rows,
    # which keeps the result in the frame's own row order.
    well_min = per_well.transform('min')
    well_max = per_well.transform('max')
    scaled = (dataFrame[col] - well_min) / (well_max - well_min)
    return scaled.tolist()

def Delta_Feature(dataFrame,col='GR',inverted=False):
    """Difference between each row's ``col`` value and the previous row of the same well.

    The first row of every well has no predecessor and is compared with
    itself, so its delta is 0 (or NaN when the value itself is missing).

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain a ``WELL`` column and the column named by ``col``.
    col : str, default 'GR'
        Name of the column to difference.
    inverted : bool, default False
        When True the sign of every delta is flipped (previous - current).

    Returns
    -------
    list
        One delta per row, in the row order of ``dataFrame``. The result is
        aligned with the rows even when the rows of a well are not stored
        contiguously. Deltas are returned as floats.
    """
    current = dataFrame[col]
    per_well = dataFrame.groupby('WELL', sort=False)[col]

    previous = per_well.shift(1)
    # A well's first row is paired with itself instead of with NaN so that its
    # delta is 0; NaNs that come from the data itself are left untouched.
    starts_well = per_well.cumcount() == 0
    previous = previous.where(~starts_well, current)

    delta = current - previous
    if inverted:
        delta = -delta
    return delta.tolist()

def Add_New_Features(dataFrame):
    """Append the engineered feature columns to ``dataFrame`` and return it.

    Columns added, in this order:

    * ``Carbon_Index`` -- ``154.497 / RHOB - 57.261``
      (NOTE(review): presumably an empirical density-based relation; the
      source of the two constants is not stated here -- confirm).
    * ``Normalized_RHOB``, ``Normalized_GR`` -- per-well min-max scaling.
    * ``Delta_DTC``, ``Delta_RHOB``, ``Delta_GR`` -- per-well row-to-row
      differences with the sign flipped.
    * ``Delta_DEPTH_MD``, ``Delta_Carbon_Index`` -- per-well row-to-row
      differences.

    The frame passed in is modified in place; the same object is returned.
    It must contain ``WELL``, ``RHOB``, ``GR``, ``DTC`` and ``DEPTH_MD``.
    """
    density = dataFrame['RHOB'].values
    dataFrame['Carbon_Index'] = (154.497/density) - 57.261

    normalized_columns = (
        ('Normalized_RHOB', 'RHOB'),
        ('Normalized_GR', 'GR'),
    )
    for target, source in normalized_columns:
        dataFrame[target] = Normalize_Data_by_Well(dataFrame, col=source)

    # (new column, source column, flip sign). Carbon_Index must already exist
    # by the time its delta is computed, hence it comes after the assignment above.
    delta_columns = (
        ('Delta_DTC', 'DTC', True),
        ('Delta_RHOB', 'RHOB', True),
        ('Delta_GR', 'GR', True),
        ('Delta_DEPTH_MD', 'DEPTH_MD', False),
        ('Delta_Carbon_Index', 'Carbon_Index', False),
    )
    for target, source, flip_sign in delta_columns:
        dataFrame[target] = Delta_Feature(dataFrame, col=source, inverted=flip_sign)

    return dataFrame

def Fill_Data(dataFrame,fill_formation,fill_BS,fill_with_median):
    """Fill missing values and one-hot encode the ``FORMATION`` column.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain ``FORMATION`` and ``BS`` columns. Its missing values are
        filled in place, so the caller's frame is modified.
    fill_formation
        Replacement for missing ``FORMATION`` entries.
    fill_BS
        Replacement for missing ``BS`` entries.
    fill_with_median
        Passed to ``DataFrame.fillna`` for everything still missing after the
        two column-specific fills (presumably per-column medians, going by
        the name -- confirm against the code that produced it).

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``FORMATION`` is replaced by ``FORMATION_<value>``
        indicator columns, with the first category dropped.

    NOTE(review): ``drop_first=True`` drops the first category found in the
    frame being encoded. If that differs from the category dropped when the
    models were fitted, the indicator columns will not line up -- verify.
    """
    # Column-specific replacements are applied before the catch-all fill.
    formation_filled = dataFrame.FORMATION.fillna(fill_formation)
    dataFrame.FORMATION = formation_filled
    bs_filled = dataFrame.BS.fillna(fill_BS)
    dataFrame.BS = bs_filled

    dataFrame.fillna(value=fill_with_median, inplace=True)

    encoded = pd.get_dummies(dataFrame, columns=['FORMATION'], drop_first=True)
    return encoded

In [3]:
# Lithology code -> class index. The codes are listed in class-index order
# (position 0 maps to class 0, ... position 11 to class 11); predict() inverts
# this mapping to turn an argmax column index back into a lithology code.
lithology_numbers = {
    code: class_index
    for class_index, code in enumerate((
        30000, 65030, 65000, 80000, 74000, 70000,
        70032, 88000, 86000, 99000, 90000, 93000,
    ))
}

# Submission

In [4]:
class Model(object):
    """Ensemble of pickled classifiers with the pre/post-processing around them.

    ``predict`` takes the raw feature frame, engineers and fills features,
    sums ``predict_proba`` over every model file, picks the most probable
    class, converts it to a lithology code and smooths the codes per well.
    """

    def __init__(self, model_files, fill_information='fill_information.pkl'):
        """Remember the model paths and load the pickled fill information.

        Parameters
        ----------
        model_files : sequence of str
            Paths of the pickled models. They are not loaded here; ``predict``
            loads them one at a time.
        fill_information : str, default 'fill_information.pkl'
            Path of a pickled sequence. Elements 0, 1 and 2 are handed to
            ``Fill_Data`` (FORMATION fill, BS fill, generic fill values) and
            the last element is the list of feature columns the models expect.
        """
        self.model_files=model_files

        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only point this at files you produced yourself.
        with open(fill_information, 'rb') as handle:
            self.fill_information = pk.load(handle)

    def _preprocess(self, features):
        """Build the model input matrix from a raw feature frame.

        Returns a new frame with exactly the columns listed in
        ``self.fill_information[-1]``, in that order; expected columns that
        preprocessing did not produce are filled with 0.0. The frame passed
        in is left untouched.
        """
        # Add_New_Features and Fill_Data both write into the frame they are
        # given, so work on a copy to keep the caller's data as loaded.
        features = Add_New_Features(features.copy())
        features = Fill_Data(features,self.fill_information[0],self.fill_information[1],self.fill_information[2])

        # reindex selects/creates columns positionally per row, so it does not
        # depend on the frame having a default 0..n-1 index.
        complete_features = features.reindex(columns=self.fill_information[-1], fill_value=0.0)
        return complete_features.reset_index(drop=True)

    def corrections(self, features, min_samples=20, kernel_size=3):
        """Smooth the ``Prediction`` column well by well.

        For every well, classes predicted fewer than ``min_samples`` times are
        replaced by that well's most frequent class, then a median filter of
        width ``kernel_size`` is run along the well.

        ``features`` needs ``WELL`` and ``Prediction`` columns; its
        ``Prediction`` column is overwritten in place. Returns the corrected
        predictions as an array in row order.
        """
        for well in features['WELL'].unique():
            in_well = features['WELL'] == well
            # Explicit copy: the array is edited below before being written back.
            well_pred = features.loc[in_well, 'Prediction'].to_numpy().copy()

            counts = features.loc[in_well, 'Prediction'].value_counts()
            dominant = counts.index[0]
            rare_classes = counts[counts < min_samples].index
            well_pred[np.isin(well_pred, rare_classes)] = dominant

            # medfilt zero-pads the ends of the signal. With too few samples
            # the padding outnumbers the data and the median becomes 0, which
            # is not a lithology code, so such wells are left unfiltered.
            if well_pred.size > kernel_size // 2:
                well_pred = medfilt(well_pred, kernel_size=kernel_size)

            features.loc[in_well, 'Prediction'] = well_pred
        return features['Prediction'].values

    def predict(self, features):
        """Predict one lithology code per row of ``features``.

        ``features`` is the raw, unprocessed frame as read from test.csv.
        Returns an array of integer lithology codes of the same length. The
        input frame is not modified.
        """
        features_base = features.copy()
        X = self._preprocess(features)

        # NOTE(review): assumes every model's predict_proba returns one column
        # per entry of lithology_numbers, in class-index order -- confirm.
        y_t = np.zeros((X.shape[0], len(lithology_numbers)))
        for model_file in self.model_files:
            # NOTE(security): pickle.load executes arbitrary code from the
            # file; only load model files from a trusted source.
            with open(model_file, 'rb') as handle:
                model = pk.load(handle)
            y_t += model.predict_proba(X)
            del model # delete object to clear memory
        predictions = np.argmax(y_t, axis=-1)

        category_to_lithology = {y:x for x,y in lithology_numbers.items()}
        test_prediction_for_submission = np.array(
            [category_to_lithology[category] for category in predictions],
            dtype=np.int64)

        features_base['Prediction'] = test_prediction_for_submission
        return self.corrections(features_base)

# Load Test Data

In [5]:
# Raw test features; the file is ';'-separated rather than comma-separated.
open_test_features = pd.read_csv(
    'test.csv',
    sep=';',
)

In [6]:
# Ensemble of five pickled models plus the pickled fill information used
# during preprocessing.
model = Model(
    model_files=[
        'trained_model_1.pkl',
        'trained_model_2.pkl',
        'trained_model_3.pkl',
        'trained_model_4.pkl',
        'trained_model_5.pkl',
    ],
    fill_information='fill_information.pkl',
)

In [7]:
# Add_New_Features and Fill_Data write into the frame they receive. Passing a
# copy keeps open_test_features exactly as loaded, so re-running this cell
# gives the same predictions.
test_prediction = model.predict(open_test_features.copy())

In [8]:
# One integer code per line under a single 'lithology' header line;
# comments='' stops numpy from prefixing the header with '# '.
np.savetxt(
    'test_predictions.csv',
    test_prediction,
    fmt='%i',
    header='lithology',
    comments='',
)