In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import pickle
from utility import data_modify, impute_nan, combine_features, augment_features
# Turn off pandas' chained-assignment warnings for the whole session
# (None disables the check; pandas' default is to warn).
pd.options.mode.chained_assignment = None

In [2]:
# Lithology code -> consecutive category index (0..11), numbered in the
# order the codes are listed below.
lithology_numbers = {
    code: index
    for index, code in enumerate((
        30000, 65030, 65000, 80000, 74000, 70000,
        70032, 88000, 86000, 99000, 90000, 93000,
    ))
}

In [7]:
class Model(object):
    """Inference wrapper around a pickled estimator and a pickled scaler.

    The estimator must expose ``predict`` and the scaler must expose
    ``transform``; both are restored from disk when the object is built.
    """

    # Columns picked from the raw input frame before any feature engineering.
    FEATURE_COLUMNS = ('DEPTH_MD', 'X_LOC', 'Y_LOC', 'Z_LOC', 'CALI', 'GR',
                       'RSHA', 'RMED', 'RDEP', 'RHOB', 'NPHI', 'PEF', 'DTC',
                       'DTS', 'SP', 'ROP', 'BS')
    # Columns discarded after combine_features and before augmentation.
    DROPPED_COLUMNS = ('DTS', 'ROP', 'BS')

    def __init__(self, model_file, scaler_file):
        """Load the pre-trained model and the pre-fitted scaler.

        Args:
            model_file: Path to the pickled estimator.
            scaler_file: Path to the pickled scaler.

        Raises:
            OSError: If either file cannot be opened.
            pickle.UnpicklingError: If either file is not a valid pickle.
        """
        self.model = self._load_pickle(model_file)
        self.scaler = self._load_pickle(scaler_file)

    @staticmethod
    def _load_pickle(path):
        """Unpickle and return the object stored at ``path``."""
        # NOTE(security): unpickling can execute arbitrary code; only load
        # files that come from a trusted source.
        # The context manager closes the handle deterministically instead of
        # leaving it to the garbage collector.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def _preprocess(self, features):
        """Turn the raw input frame into the scaled matrix the model expects.

        Runs before inference: selects the feature columns, applies the
        ``utility`` helpers in a fixed order, drops the columns that are not
        fed to the model, and scales the result.

        Args:
            features: Raw DataFrame. Must contain every column in
                ``FEATURE_COLUMNS`` plus 'GROUP', 'FORMATION' and 'WELL'.

        Returns:
            The output of ``self.scaler.transform`` on the augmented features.
        """
        feats = features[list(self.FEATURE_COLUMNS)]
        feats = data_modify(feats)
        # Remaining NaNs in the numeric columns are replaced with 0 before the
        # GROUP / FORMATION columns are imputed.
        feats = feats.fillna(0)
        group, formation = impute_nan(feats, features['GROUP'], features['FORMATION'])
        feats = combine_features(feats, formation, group)
        feats = feats.drop(columns=list(self.DROPPED_COLUMNS))
        feats_aug = augment_features(feats, features['WELL'], features['DEPTH_MD'])

        return self.scaler.transform(feats_aug)

    def predict(self, features):
        """Predict from features in their raw, unprocessed form.

        Args:
            features: Raw DataFrame as read from the file test.csv.

        Returns:
            Whatever ``self.model.predict`` returns for the preprocessed
            input; expected to be one prediction per input row.
        """
        X = self._preprocess(features)
        return self.model.predict(X)

In [8]:
# Build the inference wrapper from the pickles in the working directory.
model = Model(model_file='model.pkl', scaler_file='scaler.pkl')

In [9]:
# Read the semicolon-separated test set.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
open_test = pd.read_csv(
    'D:/.jupyter/Machine_Predicted_Lithology/test.csv',
    sep=';',
    memory_map=True,
)

In [10]:
# Run preprocessing and inference on the raw test frame.
test_prediction = model.predict(features=open_test)

In [11]:
# Display the raw predictions (category indices, not lithology codes yet).
test_prediction

array([2., 2., 2., ..., 1., 0., 1.])

In [12]:
# Inverse of lithology_numbers: category index -> lithology code.
category_to_lithology = dict(
    zip(lithology_numbers.values(), lithology_numbers.keys())
)

In [13]:
# Translate every predicted category index back to its lithology code.
# The predictions are floats while the dict keys are ints; the lookup still
# hits because equal numeric values hash identically in Python.
test_prediction_for_submission = np.vectorize(
    lambda category: category_to_lithology.get(category)
)(test_prediction)

In [14]:
# Display the predictions after mapping them to lithology codes.
test_prediction_for_submission

array([65000, 65000, 65000, ..., 65030, 30000, 65030])

In [None]:
# Write one integer lithology code per line under a bare 'lithology' header
# (comments='' stops numpy from prefixing the header with '# ').
np.savetxt(
    'test_predictions.csv',
    test_prediction_for_submission,
    fmt='%i',
    header='lithology',
    comments='',
)