In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import xgboost
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization




Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=e35f02e4ee82df6734370fbbf11b0c4d82404b14db8e6b1acac8fe8600c5a8e9
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from bayes_opt import BayesianOptimization
from sklearn.metrics import classification_report,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import pickle
import json


''' Starting preprocessing data '''

# Load the raw records, discard the 'id' column, then remove repeated rows.
df = (
    pd.read_csv("testdata.csv")
    .drop(columns=['id'])
    .drop_duplicates()
)

##split and remove '0' from the decimal part
def strip(x):
  """Return the text form of ``x`` up to, but not including, its first '.'.

  When the text contains no '.', the whole text is returned.
  """
  head, _dot, _tail = str(x).partition('.')
  return head
# Apply strip() to each of these columns, replacing every value with the
# text that precedes its first '.'.
for _col in ('amount', 'tax', 'vat_deducation'):
    df[_col] = df[_col].apply(strip)

# def stripstamp(x):
#   return  str(x).split(' ')[0]
# df['billdate'] = df['billdate'].apply(lambda x: stripstamp(x))

##scale price amount by multiplying with a constant 100
def price(x):
  """Multiply ``x`` by 100 and return the product converted to text."""
  scaled = x * 100
  return str(scaled)
df['price'] = df['price'].apply(price)

# Parse the bill date, then expose its calendar parts as separate columns.
df['billdate'] = pd.to_datetime(df['billdate'])
for _part in ('year', 'month', 'day'):
    df['billdate_' + _part] = getattr(df['billdate'].dt, _part)

# Missing units become the literal text "null".
df['unit'] = df['unit'].fillna("null")

##replace a product that consists only of the '-' placeholder with noproduct
# The pattern is anchored to the whole value: an unanchored '-' with regex=True
# would also rewrite every hyphen inside a real product name to "noproduct".
# Hyphens inside real names are removed with the other special characters below.
df['product'] = df['product'].replace(r'^\s*-\s*$', "noproduct", regex=True)

##remove special characters in customername and product columns
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]

# A translation table deletes every special character and every space in one
# literal pass. Nothing is interpreted as a regular expression, so the result
# does not depend on the default of str.replace's `regex` argument.
_strip_table = str.maketrans('', '', ''.join(spec_chars) + ' ')

##remove special characters and white space, then convert to lowercase
for _col in ('product', 'customername'):
    df[_col] = df[_col].str.translate(_strip_table).str.lower()

###keep only invoice records whose account_code occurs more than once
df_filt = df.groupby('account_code').filter(lambda grp: grp.shape[0] > 1)

###the features are every column except the label column
features = df_filt.drop(columns=['account_code'])

# Find features whose number of unique values is <1% of the total records and drop them.
# This decreased the accuracy, so this approach was not used.
# counts = features.nunique()
# to_del = [i for i,v in enumerate(counts) if (float(v)/features.shape[0]*100) < 1]
# feat_to_del = features.columns[to_del]
# features = features.drop(feat_to_del,axis=1)

###convert non-numeric features to categorical and replace to codes
''' start :  encoding and generating pickle files for each feature and saving them'''
categorical_columns = features.select_dtypes(include = "object").columns
for column in categorical_columns:
    features[column] = features[column].astype('category')
    features[column + '_cat'] = features[column].cat.codes
    # code -> original value lookup; only this column is converted to a dict
    # instead of the whole frame
    column_dict = features.set_index(column + '_cat')[column].to_dict()
    # the context manager closes (and flushes) the pickle file deterministically
    with open(column + ".pickle", "wb") as pickle_file:
        pickle.dump(column_dict, pickle_file)
    features[column] = features[column + '_cat']

##drop the helper '<column>_cat' columns (their codes now live in the original
##columns) together with the raw 'billdate' timestamp; the helper names are
##derived from the encoded columns so the list cannot drift out of sync
helper_columns = [name + '_cat' for name in categorical_columns]
features_enc = features.drop(columns=helper_columns + ['billdate'])


###label encode the target column (account_code)
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df_filt['account_code'])

# original label -> encoded value
_known_labels = label_encoder.classes_
le_name_mapping = {
    label: code
    for label, code in zip(_known_labels, label_encoder.transform(_known_labels))
}

##class to parse json dtypes
class NpEncoder(json.JSONEncoder):
    """JSON encoder that turns NumPy scalar numbers into built-in numbers."""

    def default(self, obj):
        """Return a JSON-serializable replacement for ``obj``.

        NumPy integers are converted with ``int`` and NumPy floats with
        ``float``. Every other object (np.ndarray included) is handed to the
        base encoder, which raises ``TypeError``.
        """
        for numpy_kind, builtin_kind in ((np.integer, int), (np.floating, float)):
            if isinstance(obj, numpy_kind):
                return builtin_kind(obj)
        return super().default(obj)

# json only consults the encoder's default() for values, never for dict keys,
# so NumPy integer labels are turned into plain ints first; any other key is
# passed through untouched.
json_ready_mapping = {
    (int(label) if isinstance(label, np.integer) else label): code
    for label, code in le_name_mapping.items()
}
with open('label_mapping.json', 'w') as fp:
    json.dump(json_ready_mapping, fp, cls=NpEncoder)

###Split data into train, test datasets
# stratified on the encoded labels; 15% of the rows are held out for testing
x_train, x_test, y_train, y_test = train_test_split(features_enc, Y, stratify=Y, random_state=7, test_size=0.15)

##Invoke classifier
xgb_clf = XGBClassifier()

''' Fitting the model '''
xgb_clf.fit(x_train, y_train)
preds = xgb_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, preds)

print(classification_report(y_test, preds))

# save the baseline model; the context manager closes the file so the pickle
# is fully flushed to disk
filename = 'XGBoost.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_clf, model_file)

# same report as printed above, stored as a table with one row per entry
xgb_crf = pd.DataFrame(classification_report(y_true = y_test, y_pred = preds, output_dict=True)).transpose()
xgb_crf.to_csv('xgb_cr.csv', index = True)


#Converting the dataframe into XGBoost’s Dmatrix object
dtrain = xgboost.DMatrix(x_train, y_train)

''' Tuning the model '''
#Bayesian Optimization function for xgboost
#specify the parameters  to tune as keyword arguments
#Cross validating with the specified parameters in 5 folds and 100 iterations & Return mlogloss
def tune_xgb(max_depth,eta):
    """Objective for the Bayesian optimizer: cross-validated multi-class log loss.

    Parameters
    ----------
    max_depth : float
        Candidate tree depth; truncated to an int before it is used.
    eta : float
        Candidate value for the booster's ``eta`` parameter.

    Returns
    -------
    float
        The negated ``test-mlogloss-mean`` of the last row of the
        ``xgboost.cv`` result, so a larger return value means a lower loss.

    Reads the module-level ``dtrain`` and ``label_encoder`` and prints the
    full cross-validation table on every call.
    """
    params = {'max_depth': int(max_depth),
              'gamma': 0,
              'colsample_bytree' : 1,
              'subsample': 0.8,
              'eta': eta,
              'min_child_weight' : 1,
              'objective' : 'multi:softprob',
              # taken from the fitted label encoder rather than a hard-coded
              # count, so it stays correct when the set of labels changes
              'num_class' : len(label_encoder.classes_),
              'eval_metric': 'mlogloss'}
    cv_result = xgboost.cv(params, dtrain, stratified=True, num_boost_round=100, nfold=5, early_stopping_rounds=10)
    print(cv_result)
    return -1.0 * cv_result['test-mlogloss-mean'].iloc[-1]


#Invoking the Bayesian Optimizer with the specified parameters to tune
# random_state is fixed (same seed as the train/test split) so that repeated
# runs explore the same points
xgb_bo = BayesianOptimization(tune_xgb, {'max_depth': (2, 8),
                                         'eta':(0.01,0.3)},
                              random_state=7)

#performing Bayesian optimization for 20 iterations with 3 steps of random exploration
#with an acquisition function of expected improvement
xgb_bo.maximize(n_iter=20, init_points=3, acq='ei')

#Extracting the best parameters
params = xgb_bo.max['params']
print(params)
#example output from an earlier run:
#{'eta': 0.15892843654967728, 'max_depth': 6.073989506755048}

#Converting the max_depth value from float to int
params['max_depth']= int(params['max_depth'])

''' Fitting the tuned model '''
#Initialize an XGBClassifier with the tuned parameters and fit the training data
# eta is handed over under the scikit-learn wrapper's own name, learning_rate
# NOTE(review): tune_xgb cross-validates with subsample=0.8, which is not
# passed to this classifier — confirm whether that is intended.
xgb_tuned = XGBClassifier(max_depth=params['max_depth'],
                          learning_rate=params['eta']).fit(x_train, y_train)

predic = xgb_tuned.predict(x_test)

cnf_matrix = confusion_matrix(y_test, predic)

''' Writing classification report and saving the model as pickle file '''
print(classification_report(y_test, predic))
xgb_tune_cr = classification_report(y_test, predic)
# the context manager closes the file so the pickle is fully flushed to disk
filename1 = 'XGBoost_tuned.pkl'
with open(filename1, 'wb') as tuned_model_file:
    pickle.dump(xgb_tuned, tuned_model_file)

xgb_tune_crf = pd.DataFrame(classification_report(y_true = y_test, y_pred = predic, output_dict=True)).transpose()
xgb_tune_crf.to_csv('xgb_tuned_cr.csv', index= True)

''' Data pre-processing, data engineering, data modelling and fitting is completed '''


# Unused code, can be used to transform with pipeline and featureunion if features have mixed datatypes(non-numeric)
# Make use of TextSelecter and NumberSelector to pick a single column
# class NumberSelector(BaseEstimator, TransformerMixin):
#     def __init__(self, key):
#         self.key = key
#     def fit(self, X, y=None):
#         return self
#     def transform(self, X):
#         return X[[self.key]]
#
#
# product =  Pipeline([('selector', NumberSelector(key='product')),('standard', None)])
# amount =  Pipeline([('selector', NumberSelector(key='amount')),('standard', None)])
# price =  Pipeline([('selector', NumberSelector(key='price')),('standard', None)])
# unit =  Pipeline([('selector', NumberSelector(key='unit')),('standard', None)])
# tax =  Pipeline([('selector', NumberSelector(key='tax')),('standard', None)])
# invoiceid =  Pipeline([('selector', NumberSelector(key='invoiceid')),('standard', None)])
# bodyid =  Pipeline([('selector', NumberSelector(key='bodyid')),('standard', None)])
# invoicestatusid =  Pipeline([('selector', NumberSelector(key='invoicestatusid')),('standard', None)])
# customername =  Pipeline([('selector', NumberSelector(key='customername')),('standard', None)])
# billdate =  Pipeline([('selector', NumberSelector(key='billdate')),('standard', None)])
# vat_deducation = Pipeline([('selector', NumberSelector(key='vat_deducation')),('standard', None)])
# vat_status =  Pipeline([('selector', NumberSelector(key='vat_status')),('standard', None)])
#
#
# Datafeatures = FeatureUnion([('product', product), ('amount',amount), ('price', price),
#                              ('unit', unit), ('tax', tax), ('invoiceid', invoiceid), ('bodyid', bodyid),
#                              ('invoicestatusid', invoicestatusid), ('customername', customername),
#                              ('billdate', billdate), ('vat_deducation', vat_deducation),
#                              ('vat_status', vat_status)])
#
# pipeline_xgb = Pipeline([
#     ('features', Datafeatures),
#     ('classifier', XGBClassifier(random_state=1001))])


|   iter    |  target   |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------


