# Featurizer Using Custom Scikit-Learn Transformer For SageMaker Inference Pipeline



In [203]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes

color = sns.color_palette()

%matplotlib inline

eng_stopwords = set(stopwords.words("english"))
pd.options.mode.chained_assignment = None

In [204]:
# Read the train and test dataset and check the top few lines ##
labeled_raw_df = pd.read_csv("iso20022-data/labeled_data.csv")

fts=[
 'y_target',   
 'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_Dbtr_PstlAdr_Ctry',  
 'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_Cdtr_PstlAdr_Ctry', 
 'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_DbtCdtRptgInd', 
 'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_Authrty_Ctry', 
 'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_Dtls_Cd',
 'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_InstrForNxtAgt_InstrInf',
]

# New data frame with selected features
selected_df = labeled_raw_df[fts]
    
selected_df.head()

Unnamed: 0,y_target,Document_FIToFICstmrCdtTrf_CdtTrfTxInf_Dbtr_PstlAdr_Ctry,Document_FIToFICstmrCdtTrf_CdtTrfTxInf_Cdtr_PstlAdr_Ctry,Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_DbtCdtRptgInd,Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_Authrty_Ctry,Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_Dtls_Cd,Document_FIToFICstmrCdtTrf_CdtTrfTxInf_InstrForNxtAgt_InstrInf
0,Success,TH,IE,,,,
1,Success,IN,US,DEBT,IN,13.P1302,
2,Success,TH,GB,,,,
3,Success,TH,GB,,,,
4,Failure,GB,IN,,,,


In [205]:
# Rename columns
selected_df = selected_df.rename(columns={
    'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_Dbtr_PstlAdr_Ctry': 'Dbtr_PstlAdr_Ctry',
    'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_Cdtr_PstlAdr_Ctry': 'Cdtr_PstlAdr_Ctry',
    'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_DbtCdtRptgInd': 'RgltryRptg_DbtCdtRptgInd',
    'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_Authrty_Ctry': 'RgltryRptg_Authrty_Ctry',
    'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_RgltryRptg_Dtls_Cd': 'RgltryRptg_Dtls_Cd',
    'Document_FIToFICstmrCdtTrf_CdtTrfTxInf_InstrForNxtAgt_InstrInf': 'InstrForNxtAgt',
})

selected_df.head()

Unnamed: 0,y_target,Dbtr_PstlAdr_Ctry,Cdtr_PstlAdr_Ctry,RgltryRptg_DbtCdtRptgInd,RgltryRptg_Authrty_Ctry,RgltryRptg_Dtls_Cd,InstrForNxtAgt
0,Success,TH,IE,,,,
1,Success,IN,US,DEBT,IN,13.P1302,
2,Success,TH,GB,,,,
3,Success,TH,GB,,,,
4,Failure,GB,IN,,,,


In [206]:
# Categorical data transformation.

categorical_fts=[
 'Dbtr_PstlAdr_Ctry', 
 'Cdtr_PstlAdr_Ctry',
 'RgltryRptg_DbtCdtRptgInd',    
 'RgltryRptg_Authrty_Ctry', 
 'RgltryRptg_Dtls_Cd'
]

integer_fts=[
    
]

numeric_fts=[
    
]

text_fts=[
# 'InstrForNxtAgt'
]

# Convert categorical features to categorical data type.
feature_categories = {}
for col in categorical_fts:
    selected_df[col] = pd.Categorical(selected_df[col])
    feature_categories[col] = selected_df[col].cat.categories
    #print(f"feature: {col}, Categories: {dict(enumerate(selected_df[col].cat.categories))}")

print(f"feature_categories: {col_categories}")
# for col in categorical_fts:
#     print(f"Retrieved feature: {col}, Categories: {dict(enumerate(feature_categories[col]))}")

# # Convert from original feature values to categorical values
# for col in categorical_fts:
#     selected_df[col] = selected_df[col].cat.codes

# # Convert categorical features to categorical data type.
# for col in categorical_fts:
#     print('Attempt 2 to convert')
#     selected_df[col] = pd.Categorical(selected_df[col], feature_categories[col])
#     print(f"feature: {col}, Categories: {dict(enumerate(selected_df[col].cat.categories))}")
    
# for col in categorical_fts:
#     print('Attempt 2 to set codes')
#     selected_df[col] = selected_df[col].cat.codes

    
for col in integer_fts:
    selected_df[col] = selected_df[col].astype(str).astype('int64')
    
for col in numeric_fts:
    selected_df[col] = selected_df[col].astype(str).astype('float64')

for col in text_fts:
    selected_df[col] = selected_df[col].astype(str).astype('string')
    
selected_df.dtypes

feature_categories: {'Dbtr_PstlAdr_Ctry': Index(['CA', 'GB', 'IE', 'IN', 'MX', 'TH', 'US'], dtype='object'), 'Cdtr_PstlAdr_Ctry': Index(['CA', 'GB', 'IE', 'IN', 'MX', 'TH', 'US'], dtype='object'), 'RgltryRptg_DbtCdtRptgInd': Index(['CRED', 'DEBT'], dtype='object'), 'RgltryRptg_Authrty_Ctry': Index(['IN'], dtype='object'), 'RgltryRptg_Dtls_Cd': Index(['00.00000', '00.P0006', '00.P0008', '13.P1301', '13.P1302'], dtype='object')}


y_target                      object
Dbtr_PstlAdr_Ctry           category
Cdtr_PstlAdr_Ctry           category
RgltryRptg_DbtCdtRptgInd    category
RgltryRptg_Authrty_Ctry     category
RgltryRptg_Dtls_Cd          category
InstrForNxtAgt                object
dtype: object

In [207]:
selected_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1389 entries, 0 to 1388
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   y_target                  1389 non-null   object  
 1   Dbtr_PstlAdr_Ctry         1389 non-null   category
 2   Cdtr_PstlAdr_Ctry         1389 non-null   category
 3   RgltryRptg_DbtCdtRptgInd  327 non-null    category
 4   RgltryRptg_Authrty_Ctry   327 non-null    category
 5   RgltryRptg_Dtls_Cd        327 non-null    category
 6   InstrForNxtAgt            359 non-null    object  
dtypes: category(5), object(2)
memory usage: 29.7+ KB


In [208]:
selected_df

Unnamed: 0,y_target,Dbtr_PstlAdr_Ctry,Cdtr_PstlAdr_Ctry,RgltryRptg_DbtCdtRptgInd,RgltryRptg_Authrty_Ctry,RgltryRptg_Dtls_Cd,InstrForNxtAgt
0,Success,TH,IE,,,,
1,Success,IN,US,DEBT,IN,13.P1302,
2,Success,TH,GB,,,,
3,Success,TH,GB,,,,
4,Failure,GB,IN,,,,
...,...,...,...,...,...,...,...
1384,Success,CA,IN,CRED,IN,00.00000,/REG/15.X0002 FDI in Aggriculture
1385,Failure,TH,GB,,,,/SVC/It is to be delivered in three days. Grea...
1386,Failure,CA,IN,CRED,IN,00.00000,/REG/99.A27865
1387,Success,IN,GB,DEBT,IN,00.P0008,


In [209]:
X_train, X_test, y_train, y_test = train_test_split(selected_df, selected_df['y_target'], test_size=0.20, random_state=299, shuffle=True)
train_df = X_train
test_df = X_test

print("Number of rows in train dataset : ",train_df.shape[0])
print("Number of rows in test dataset : ",test_df.shape[0])

Number of rows in train dataset :  1111
Number of rows in test dataset :  278


In [210]:
train_df

Unnamed: 0,y_target,Dbtr_PstlAdr_Ctry,Cdtr_PstlAdr_Ctry,RgltryRptg_DbtCdtRptgInd,RgltryRptg_Authrty_Ctry,RgltryRptg_Dtls_Cd,InstrForNxtAgt
500,Success,IN,US,DEBT,IN,13.P1301,
746,Success,GB,US,,,,
1107,Success,IN,GB,DEBT,IN,00.P0006,
81,Success,IE,MX,,,,
956,Success,TH,GB,,,,
...,...,...,...,...,...,...,...
555,Success,CA,TH,,,,
404,Failure,CA,IE,,,,/SVC/tcrgzqml6248dmnb
53,Failure,IN,GB,DEBT,IN,13.P1302,/SVC/
714,Success,US,GB,,,,/SVC/It is to be delivered in one day. Two day...


In [211]:
test_df

Unnamed: 0,y_target,Dbtr_PstlAdr_Ctry,Cdtr_PstlAdr_Ctry,RgltryRptg_DbtCdtRptgInd,RgltryRptg_Authrty_Ctry,RgltryRptg_Dtls_Cd,InstrForNxtAgt
1064,Success,IE,GB,,,,
1149,Success,IE,TH,,,,
1154,Success,CA,IE,,,,
1291,Success,IN,TH,DEBT,IN,00.P0008,
1045,Success,GB,MX,,,,
...,...,...,...,...,...,...,...
394,Failure,GB,IN,CRED,IN,13.P1302,/REG/00.P0008
877,Failure,IN,TH,,,,
1019,Success,GB,IE,,,,
650,Success,US,GB,,,,


# Feature Engineering

## Custom Transformer for Text Preprocessing

In [212]:
import numpy as np 
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
import string
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, Binarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes
#from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction import _stop_words
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


eng_stopwords = _stop_words.ENGLISH_STOP_WORDS
print(eng_stopwords)

def create_meta_text_features(text_df, text_feature_name):
    # Number of characters in the text
    text_df[text_feature_name + "_num_chars"] = text_df[text_feature_name].apply(lambda x: len(str(x)))

    # Number of words in the text
    text_df[text_feature_name + "_num_words"] = text_df[text_feature_name].apply(lambda x: len(str(x).split()))
    
    # Average length of the words in the text
    text_df[text_feature_name + "_mean_word_len"] = text_df[text_feature_name].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

    # Number of unique words in the text
    text_df[text_feature_name + "_num_unique_words"] = text_df[text_feature_name].apply(lambda x: len(set(str(x).split())))

    # Number of stopwords in the text
    text_df[text_feature_name + "_num_stopwords"] = text_df[text_feature_name].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
    
    # Number of punctuations in the text
    text_df[text_feature_name + "_num_punctuations"] = text_df[text_feature_name].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    
    # Number of upper case words in the text
    text_df[text_feature_name + "_num_words_upper"] = text_df[text_feature_name].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

    return text_df

# Fit naive bayes model on input data
def fit_naive_bayes_model(train_X, train_y):
    model = naive_bayes.MultinomialNB()
    model.fit(train_X, train_y)
    
    return model

# Fit the tfidf vectorizer on words in text_feature_name and returns the model
def fit_word_tfidf_vectorizer(text_df, text_feature_name):
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
    tfidf_vectorizer.fit(text_df)

    return tfidf_vectorizer

# Fit the tfidf vectorizer on characters in text_feature_name and returns the model
def fit_char_tfidf_vectorizer(text_df, text_feature_name):
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
    tfidf_vectorizer.fit(text_df)

    return tfidf_vectorizer

# Fit a count vectorizer on words in text_feature_name and returns the model
def fit_word_count_vectorizer(text_df, text_feature_name):
    count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,3))
    count_vectorizer.fit(text_df)
   
    return count_vectorizer

# Fit a count vectorizer on characters in text_feature_name and returns the model
def fit_char_count_vectorizer(text_df, text_feature_name):
    count_vectorizer = CountVectorizer(ngram_range=(1,7), analyzer='char')
    count_vectorizer.fit(text_df)
   
    return count_vectorizer

# Scikit-learn custom transformer
class TextFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_name):
        print(f'TextFeatureTransformer init method, feature_name: {feature_name}')
        self.feature_name = feature_name
        
    def fit(self, X, y):
        print(f'TextFeatureTransformer.fit() method, feature_name: {self.feature_name}')

        # fill NaN with 'none'
        X[self.feature_name].fillna('none', inplace=True)
        
        # Get features as a list
        text_feature_list = X[self.feature_name].values.tolist()

        # create/fit TfidfVectorizer for words in the text
        self.word_tfidf_vectorizer = fit_word_tfidf_vectorizer(X, y)
        # Train naive bayes classifier on word tfidf vector
        X_word_tfidf_vec = self.word_tfidf_vectorizer.transform(text_feature_list)
        self.word_tfidf_nv_classifier = fit_naive_bayes_model(X_word_tfidf_vec, y)
        
        # create/fit TfidfVectorizer for characters in the text
        self.char_tfidf_vectorizer = fit_char_tfidf_vectorizer(X, y)
        # Train naive bayes classifier on character tfidf vector
        X_char_tfidf_vec = self.char_tfidf_vectorizer.transform(text_feature_list)
        self.char_tfidf_nv_classifier = fit_naive_bayes_model(X_char_tfidf_vec, y)

        # create/fit CountVectorizer for words in the text
        self.word_count_vectorizer = fit_word_count_vectorizer(X, y)
        # Train naive bayes classifier on word count vector
        X_word_count_vec = self.word_count_vectorizer.transform(text_feature_list)
        self.word_count_nv_classifier = fit_naive_bayes_model(X_word_count_vec, y)

        # create/fit CountVectorizer for characters in the text
        self.char_count_vectorizer = fit_char_count_vectorizer(X, y)
        # Train naive bayes classifier on character count vector
        X_char_count_vec = self.char_count_vectorizer.transform(text_feature_list)
        self.char_count_nv_classifier = fit_naive_bayes_model(X_char_count_vec, y)

        return self

    def transform(self, X, y=None):
        print(f'TextFeatureTransformer.transform() method, feature_name: {self.feature_name}')

        print(f"X shape before transform actions:{X.shape}")
        
        # Feature that is being transformed
        text_feature_name = self.feature_name

        # fill NaN with 'none'
        X[self.feature_name].fillna('none', inplace=True)
        
        # add meta text features
        create_meta_text_features(X, self.feature_name)

        # Get features as a list
        text_feature_list = X[self.feature_name].values.tolist()

        # Add the word tfidf based prediction probabilities for Failure or Success from text as new features
        word_tfidf_vec = self.word_tfidf_vectorizer.transform(text_feature_list)
        word_tfidf_y_pred_proba = self.word_tfidf_nv_classifier.predict_proba(word_tfidf_vec)
        X[[text_feature_name + "_nb_tfidf_word_failure", text_feature_name + "_nb_tfidf_word_success"]] = word_tfidf_y_pred_proba
        
        # Add the character tfidf based prediction probabilities for Failure or Success from text as new features
        char_tfidf_vec = self.char_tfidf_vectorizer.transform(text_feature_list)
        char_tfidf_y_pred_proba = self.char_tfidf_nv_classifier.predict_proba(char_tfidf_vec)
        X[[text_feature_name + "_nb_tfidf_char_failure", text_feature_name + "_nb_tfidf_char_success"]] = char_tfidf_y_pred_proba
        
        # Add the word count based prediction probabilities for Failure or Success from text as new features
        word_count_vec = self.word_count_vectorizer.transform(text_feature_list)
        word_count_y_pred_proba = self.word_count_nv_classifier.predict_proba(word_count_vec)
        X[[text_feature_name + "_nb_word_count_failure", text_feature_name + "_nb_word_count_success"]] = word_count_y_pred_proba 
        
        # Add the character count based prediction probabilities for Failure or Success from text as new features
        char_count_vec = self.char_count_vectorizer.transform(text_feature_list)
        char_count_y_pred_proba = self.char_count_nv_classifier.predict_proba(char_count_vec)
        X[[text_feature_name + "_nb_char_count_failure", text_feature_name + "_nb_char_count_success"]] = char_count_y_pred_proba 

        print(f"X Shape after adding char_count_y_pred_proba:{X.shape}")
        #print(f"X after adding char_count_y_pred_proba:{X}")
        
        # Drop text feature before training prediction
        X.drop([text_feature_name], axis=1, inplace=True)
        print(f"X shape after dropping text feature:{X.shape}")
        #print(f"X after dropping text feature:{X}")
        
        return X

class CategoricalFeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_features):
        print(f'CategoricalFeatureTransformer init method, categorical_features: {categorical_features}')
        self.categorical_features = categorical_features
        
    def fit(self, X, y=None):
        print(f'CategoricalFeatureTransformer.fit() method, categorical_features: {self.categorical_features}')

        categorical_features = self.categorical_features

        # Convert categorical features to categorical data and unique Index (integer value).
        self.feature_categories = {}
        for col in categorical_features:
            X[col] = pd.Categorical(X[col])
            # remember categories, this will be needed in transform
            self.feature_categories[col] = X[col].cat.categories
        
        print(f"feature_categories: {self.feature_categories}")
           
        return self

    def transform(self, X, y=None):
        print(f'CategoricalFeatureTransformer.transform() method, categorical_features: {self.categorical_features}')

        print(f"CategoricalFeatureTransformer X shape before transform actions:{X.shape}")

        categorical_features = self.categorical_features

        # Convert categorical features to categorical type and unique Index (integer value) from fit 
        # Important when used during inference, important to keep index same as in fit (in training step)
        for col in categorical_features:
            X[col] = pd.Categorical(X[col], self.feature_categories[col])

        # Convert from original string feature values to categorical integer values
        for col in categorical_features:
            X[col] = X[col].cat.codes

        return X

frozenset({'along', 'enough', 'go', 'latter', 'itself', 'over', 'very', 'hereby', 'within', 'own', 'neither', 'move', 'whether', 'someone', 'thus', 'detail', 'who', 'three', 'it', 'everything', 'eight', 'eleven', 'herself', 'mill', 'moreover', 'became', 'due', 'thick', 'your', 'front', 'of', 'such', 'almost', 'please', 'thin', 'either', 'several', 'by', 'about', 'me', 'while', 'couldnt', 'two', 'afterwards', 'and', 'get', 'most', 'full', 'both', 'eg', 'with', 'de', 'latterly', 'already', 'among', 'anyone', 'its', 'anything', 'ever', 'should', 'we', 'yours', 'found', 'why', 'cry', 'often', 'whom', 'onto', 'under', 'once', 'no', 'etc', 'will', 'since', 'beside', 'cannot', 'beyond', 'beforehand', 'mine', 'themselves', 're', 'even', 'any', 'hereupon', 'perhaps', 'former', 'somehow', 'for', 'rather', 'herein', 'something', 'whole', 'none', 'anywhere', 'to', 'everyone', 'whoever', 'nowhere', 'can', 'nine', 'hundred', 'if', 'without', 'cant', 'except', 'into', 'back', 'besides', 'ltd', 'mostl

## Scikit-Learn Pipeline

With custom text transformer.

In [213]:
from sklearn.preprocessing import FunctionTransformer

orig_features = [
 'Dbtr_PstlAdr_Ctry', 
 'Cdtr_PstlAdr_Ctry',
 'RgltryRptg_DbtCdtRptgInd',    
 'RgltryRptg_Authrty_Ctry', 
 'RgltryRptg_Dtls_Cd'
]
added_features = [
    "InstrForNxtAgt"+"_num_chars",
    "InstrForNxtAgt"+"_num_words",
    "InstrForNxtAgt"+"_mean_word_len",
    "InstrForNxtAgt"+"_num_unique_words",
    "InstrForNxtAgt"+"_num_stopwords",
    "InstrForNxtAgt"+"_num_punctuations",
    "InstrForNxtAgt"+"_num_words_upper",
    "InstrForNxtAgt"+"_nb_tfidf_word_failure",
    "InstrForNxtAgt"+"_nb_tfidf_word_success",
    "InstrForNxtAgt"+"_nb_tfidf_char_failure",
    "InstrForNxtAgt"+"_nb_tfidf_char_success",
    "InstrForNxtAgt"+"_nb_word_count_failure",
    "InstrForNxtAgt"+"_nb_word_count_success",
    "InstrForNxtAgt"+"_nb_char_count_failure",
    "InstrForNxtAgt"+"_nb_char_count_success"
]

combined_features = orig_features + added_features
print(f"Length of combined_features: {len(combined_features)}")

numeric_transformer = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# categorical_transformer = make_pipeline(
#     SimpleImputer(strategy="constant", fill_value="missing")
# )

categorical_transformer = make_pipeline(
    CategoricalFeatureTransformer(orig_features)
)

preprocessor = ColumnTransformer(
    transformers=[
#        ("num", numeric_transformer, make_column_selector(dtype_include="float64")),
         #("passthrough", 'passthrough', orig_features),
        #("cat", categorical_transformer, make_column_selector(dtype_include="category")),
        ("cat", categorical_transformer, make_column_selector(dtype_include="category")),
        ("text_custom", TextFeatureTransformer('InstrForNxtAgt'), ['InstrForNxtAgt']),
#       ("pandas_df", FunctionTransformer(lambda x: pd.DataFrame(x, columns = combined_features)))
    ]
)

# fit returns transformer
#column_tranformer = preprocessor.fit(train_df, y_train)

# fit_transform return numpy array, map back to pandas dataframe with column names
new_np = preprocessor.fit_transform(train_df, y_train)
print(f"Shape of new numpy array after column transformation: {new_np.shape}")
new_df = pd.DataFrame(new_np, index=train_df.index, columns=combined_features)
new_df

Length of combined_features: 20
CategoricalFeatureTransformer init method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
TextFeatureTransformer init method, feature_name: InstrForNxtAgt
CategoricalFeatureTransformer init method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
CategoricalFeatureTransformer.fit() method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
feature_categories: {'Dbtr_PstlAdr_Ctry': Index(['CA', 'GB', 'IE', 'IN', 'MX', 'TH', 'US'], dtype='object'), 'Cdtr_PstlAdr_Ctry': Index(['CA', 'GB', 'IE', 'IN', 'MX', 'TH', 'US'], dtype='object'), 'RgltryRptg_DbtCdtRptgInd': Index(['CRED', 'DEBT'], dtype='object'), 'RgltryRptg_Authrty_Ctry': Index(['IN'], dtype='object'), 'RgltryRptg_Dtls_Cd': I

Unnamed: 0,Dbtr_PstlAdr_Ctry,Cdtr_PstlAdr_Ctry,RgltryRptg_DbtCdtRptgInd,RgltryRptg_Authrty_Ctry,RgltryRptg_Dtls_Cd,InstrForNxtAgt_num_chars,InstrForNxtAgt_num_words,InstrForNxtAgt_mean_word_len,InstrForNxtAgt_num_unique_words,InstrForNxtAgt_num_stopwords,InstrForNxtAgt_num_punctuations,InstrForNxtAgt_num_words_upper,InstrForNxtAgt_nb_tfidf_word_failure,InstrForNxtAgt_nb_tfidf_word_success,InstrForNxtAgt_nb_tfidf_char_failure,InstrForNxtAgt_nb_tfidf_char_success,InstrForNxtAgt_nb_word_count_failure,InstrForNxtAgt_nb_word_count_success,InstrForNxtAgt_nb_char_count_failure,InstrForNxtAgt_nb_char_count_success
500,3.0,6.0,1.0,0.0,3.0,4.0,1.0,4.000000,1.0,1.0,0.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.010482,0.989518
746,1.0,6.0,-1.0,-1.0,-1.0,4.0,1.0,4.000000,1.0,1.0,0.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.010482,0.989518
1107,3.0,1.0,1.0,0.0,1.0,4.0,1.0,4.000000,1.0,1.0,0.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.010482,0.989518
81,2.0,4.0,-1.0,-1.0,-1.0,4.0,1.0,4.000000,1.0,1.0,0.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.010482,0.989518
956,5.0,1.0,-1.0,-1.0,-1.0,4.0,1.0,4.000000,1.0,1.0,0.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.010482,0.989518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,0.0,5.0,-1.0,-1.0,-1.0,4.0,1.0,4.000000,1.0,1.0,0.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.010482,0.989518
404,0.0,2.0,-1.0,-1.0,-1.0,21.0,1.0,21.000000,1.0,0.0,2.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.456546,0.543454
53,3.0,1.0,1.0,0.0,4.0,5.0,1.0,5.000000,1.0,0.0,2.0,1.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.348046,0.651954
714,6.0,1.0,-1.0,-1.0,-1.0,105.0,21.0,4.047619,19.0,9.0,4.0,0.0,0.208821,0.791179,0.208821,0.791179,0.208821,0.791179,0.955458,0.044542


## XGBoost In the Scikit Learn Pipeline



In [214]:
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost import XGBClassifier

# Define model
# Init classifier
xgb_cl = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=True, eval_metric='error')

# ### Without pipeline
# # Fit and transform training datat using column transformer
# X_train_transformed = preprocessor.fit_transform(train_df, y_train)

# # Fit
# xgb_cl.fit(X_train_transformed, y_train)

# # Predict
# X_test_preprocessed = preprocessor.transform(X_test)
# preds = xgb_cl.predict(X_test_preprocessed)
### Without pipeline END

### With pipeline
# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('classifier', xgb_cl)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(train_df, y_train)

# Get predictions using the pipeline, it calls transformer to transform before calling predict.
preds = my_pipeline.predict(X_test)
### With pipeline END

# Evaluate the model
# Accuracy Score
accuracy_score(y_test, preds)


CategoricalFeatureTransformer init method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
CategoricalFeatureTransformer.fit() method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
feature_categories: {'Dbtr_PstlAdr_Ctry': Index(['CA', 'GB', 'IE', 'IN', 'MX', 'TH', 'US'], dtype='object'), 'Cdtr_PstlAdr_Ctry': Index(['CA', 'GB', 'IE', 'IN', 'MX', 'TH', 'US'], dtype='object'), 'RgltryRptg_DbtCdtRptgInd': Index(['CRED', 'DEBT'], dtype='object'), 'RgltryRptg_Authrty_Ctry': Index(['IN'], dtype='object'), 'RgltryRptg_Dtls_Cd': Index(['00.00000', '00.P0006', '00.P0008', '13.P1301', '13.P1302'], dtype='object')}
CategoricalFeatureTransformer.transform() method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
Catego



CategoricalFeatureTransformer.transform() method, categorical_features: ['Dbtr_PstlAdr_Ctry', 'Cdtr_PstlAdr_Ctry', 'RgltryRptg_DbtCdtRptgInd', 'RgltryRptg_Authrty_Ctry', 'RgltryRptg_Dtls_Cd']
CategoricalFeatureTransformer X shape before transform actions:(278, 5)
TextFeatureTransformer.transform() method, feature_name: InstrForNxtAgt
X shape before transform actions:(278, 1)
X Shape after adding char_count_y_pred_proba:(278, 16)
X shape after dropping text feature:(278, 15)


1.0