In [1]:
#Importing all the necessary libraries
import pandas as pd    
import numpy as np 
import sklearn
import sklearn.metrics
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from scipy import stats
import warnings 
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import StandardScaler
from joblib import dump, load

# Notebook-wide seed value. A `global` statement at module level has no effect,
# so the name is simply assigned here.
# NOTE(review): nothing in the visible cells reads `random_state` — confirm it is still needed.
random_state = 42
# Suppress every warning raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
#Defining the custom objective function for the xgboost model, which takes care of both RMSE and MAE
def obj(preds, dtrain):
    """Custom objective: weighted mix of three gradient/hessian pairs.

    The residual ``preds - labels`` feeds three terms:

    * a smooth bounded term, ``c * x / (|x| + c)`` with ``c = 1.5``,
    * a squared-error term (gradient ``x``, hessian ``1``),
    * an absolute-error term (gradient ``+1`` for positive residuals and
      ``-1`` for residuals ``<= 0``, hessian fixed at ``1``).

    The terms are blended with weights 0.7 / 0.15 / 0.15.

    Parameters
    ----------
    preds : array-like
        Current predictions.
    dtrain : object
        Anything exposing ``get_label()`` that returns the true targets.

    Returns
    -------
    tuple
        ``(gradient, hessian)`` of the blended objective.
    """
    smoothing = 1.5
    w_smooth, w_squared, w_absolute = 0.7, 0.15, 0.15

    residual = preds - dtrain.get_label()
    denominator = np.abs(residual) + smoothing

    # Smooth, bounded term.
    grad_smooth = smoothing * residual / denominator
    hess_smooth = smoothing ** 2 / denominator ** 2

    # Absolute-error term: sign of the residual, with zero mapped to -1.
    grad_absolute = np.array(residual)
    positive_mask = grad_absolute > 0
    non_positive_mask = grad_absolute <= 0
    grad_absolute[positive_mask] = 1.
    grad_absolute[non_positive_mask] = -1.

    gradient = w_smooth * grad_smooth + w_squared * residual + w_absolute * grad_absolute
    hessian = w_smooth * hess_smooth + w_squared * 1.0 + w_absolute * 1.0
    return gradient, hessian


In [3]:
# `obj` must already be defined when the model is loaded: without it, loading the
# pickled estimator fails with a class import error.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
model = load('estimator.joblib')

In [4]:
def load_data(boxcox_path='all_cont.joblib', interactions_path='xgb_hist.xlsx',
              sheet_name='Interaction Depth 1', n_interactions=40):
    """Load the artefacts needed to preprocess a raw data point.

    All parameters default to the values that were previously hard-coded, so
    ``load_data()`` behaves exactly as before.

    Parameters
    ----------
    boxcox_path : str
        joblib file holding the per-column data used for the Box-Cox step.
        NOTE(review): joblib.load unpickles arbitrary objects — only point
        this at trusted files.
    interactions_path : str
        Excel workbook with an ``'Interaction'`` column of ``'a|b'`` strings.
    sheet_name : str
        Sheet of the workbook to read.
    n_interactions : int
        How many categorical-only interactions to keep, in sheet order.

    Returns
    -------
    tuple
        ``(two_way, cat_name, cont_name, boxcox_cont)`` where ``cat_name`` is
        ``cat1..cat116``, ``cont_name`` is ``cont1..cont14``, ``two_way`` is
        the list of kept interaction strings and ``boxcox_cont`` is whatever
        object the joblib file contains.
    """
    cat_name = ['cat' + str(x) for x in range(1, 117)]
    cont_name = ['cont' + str(x) for x in range(1, 15)]
    boxcox_cont = load(boxcox_path)

    xgb_excel = pd.read_excel(interactions_path, sheet_name=sheet_name)
    two_way = []
    for interaction in xgb_excel['Interaction'].values:
        parts = interaction.split('|')
        # Keep only pairs in which neither side is a continuous feature.
        if 'cont' not in parts[0] and 'cont' not in parts[1]:
            two_way.append(interaction)
    return two_way[:n_interactions], cat_name, cont_name, boxcox_cont

def lexical_encoding(charcode):
    """Encode a letter code as a bijective base-26 integer.

    ``'A'`` maps to 1, ``'Z'`` to 26, ``'AA'`` to 27 and so on. The argument is
    converted with ``str()`` first; an empty string yields 0.
    """
    offset = ord('A') - 1
    total = 0
    # Horner-style accumulation: shift the running value one base-26 digit
    # to the left, then add the current character's digit.
    for character in str(charcode):
        total = total * 26 + (ord(character) - offset)
    return total

# Load the lookup artefacts once, before preprocess_data is defined: they are
# bound as that function's default arguments.
two_way, cat_name, cont_name, boxcox_cont = load_data()
def preprocess_data(row, cat_name=cat_name, cont_name=cont_name, two_way=two_way,
                    boxcox_cont=boxcox_cont):
    """Turn one raw data point into the feature row expected by the model.

    Steps, in order:

    1. Box-Cox each continuous value: the value is prepended to the stored
       values for that column, ``scipy.stats.boxcox`` is fitted on the
       combined array (shifted by +1) and the first transformed element is
       kept.
    2. Drop a fixed list of categorical columns.
    3. Replace each remaining categorical code with ``lexical_encoding``.
    4. For every interaction ``'a|b'`` in ``two_way`` add two columns: the
       digit-wise concatenation of the two encoded values (``'a_b'``) and
       their product (``'a*b'``).

    Parameters
    ----------
    row : sequence
        Values in ``cat_name + cont_name`` order.
    cat_name, cont_name : list of str
        Categorical and continuous column names.
    two_way : list of str
        Interaction strings of the form ``'a|b'``.
    boxcox_cont : mapping
        Indexed as ``boxcox_cont[col][0]``; that element must offer
        ``.tolist()``.  # presumably the training values per column — TODO confirm

    Returns
    -------
    numpy.ndarray
        ``DataFrame.values`` of the single-row frame (2-D, one row).
    """
    frame = pd.DataFrame(columns=cat_name + cont_name)
    frame.loc[0] = row

    # Step 1: Box-Cox on the continuous columns.
    for column in cont_name:
        stored_values = boxcox_cont[column][0].tolist()
        combined = np.array([frame[column].values[0]] + stored_values)
        transformed, _ = stats.boxcox(combined + 1)
        frame.loc[0, column] = transformed[0]

    # Step 2: remove the categorical columns that are not used as features.
    unused_columns = ['cat15', 'cat22', 'cat55', 'cat56', 'cat62', 'cat63', 'cat64', 'cat68', 'cat70']
    frame = frame.drop(columns=unused_columns)

    # Step 3: encode the remaining categorical codes as integers.
    for column in cat_name:
        if column in unused_columns:
            continue
        frame[column] = frame[column].apply(lexical_encoding)

    # Step 4: interaction features.
    for interaction in two_way:
        parts = interaction.split('|')
        left, right = parts[0], parts[1]
        joined_column = interaction.replace('|', '_')
        product_column = interaction.replace('|', '*')
        joined_text = frame[left].astype(str) + frame[right].astype(str)
        frame[joined_column] = joined_text.astype(int)
        frame[product_column] = frame[left].astype(float) * frame[right].astype(float)

    return frame.values

def decode_log_200(data):
    """Map a model output back to the original scale: ``exp(data) - 200``."""
    exponentiated = np.exp(data)
    return exponentiated - 200

In [5]:
#Predicting on a single row or multiple rows
def predict_point(row, model=model):
    """Predict the decoded target for one or more raw data points.

    Parameters
    ----------
    row : sequence of sequences
        One entry per data point, each in the order expected by
        ``preprocess_data``. A single point must still be wrapped in a list.
    model : estimator
        Object exposing ``predict``; defaults to the estimator loaded above.

    Returns
    -------
    list
        One prediction per input point, passed through ``decode_log_200``.
        An empty input yields an empty list.
    """
    if len(row) == 0:
        return []

    # Preprocess every point and build the feature matrix in one go instead
    # of growing a DataFrame row by row.
    features = np.array([preprocess_data(point)[0] for point in row], dtype=float)

    # Keep the 'row<N>' column names, sized from the actual feature width
    # rather than a hard-coded 201.
    columns = ['row' + str(x) for x in range(features.shape[1])]
    df = pd.DataFrame(features, columns=columns)

    pred = model.predict(df)
    return [decode_log_200(x) for x in pred]

#Predicting on a single row or multiple rows and returning the MAE score
def predict_point_score(row, model=model):
    """Predict for one or more labelled data points and return the MAE.

    Parameters
    ----------
    row : sequence of sequences
        One entry per data point; the last element of each entry is the
        ground-truth value and everything before it is the feature input.
        A single point must still be wrapped in a list.
    model : estimator
        Object exposing ``predict``; defaults to the estimator loaded above.

    Returns
    -------
    float
        Mean absolute error between the ground truth and the decoded
        predictions.

    Raises
    ------
    ValueError
        If ``row`` is empty, since no score can be computed.
    """
    if len(row) == 0:
        raise ValueError("predict_point_score needs at least one data point")

    gt = [point[-1] for point in row]
    # Reuse the prediction pipeline instead of duplicating it here.
    predictions = predict_point([point[:-1] for point in row], model=model)
    return mae(gt, predictions)


In [6]:
# Predicting on a single point.
# Layout: 116 categorical codes, then 14 continuous values, then one trailing
# number that predict_point_score uses as the ground truth.
datapoint = ['A', 'B', 'A', 'B', 'A', 'A', 'A', 'A', 'B', 'A', 'B', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'A', 'D', 'B',
       'B', 'D', 'D', 'B', 'D', 'C', 'B', 'D', 'B', 'A', 'A', 'A', 'A',
       'A', 'D', 'B', 'C', 'E', 'A', 'C', 'T', 'B', 'G', 'A', 'A', 'I',
       'E', 'G', 'J', 'G', 'BU', 'BC', 'C', 'AS', 'S', 'A', 'O', 'LB',
       0.7263, 0.245921, 0.187583, 0.789639, 0.310061, 0.718367, 0.33506,
       0.3026, 0.67135, 0.8351, 0.569745, 0.594646, 0.822493, 0.714843,
       2213.18]

In [7]:
%%time 
# The final element of `datapoint` is the ground-truth value, so it is sliced
# off before the features are passed to the model.
predict_point([datapoint[:-1]])

CPU times: total: 1.89 s
Wall time: 1.73 s


[1788.7597788686137]

In [9]:
# Score the full point: predict_point_score takes the last element as ground truth.
predict_point_score([datapoint])

424.42022113138614