# AI on IBM Z Model Development

## Import required Python packages

In [None]:
import numpy as np
import pandas as pd
import json
import time
import math

# Model training
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Data preprocessing
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# PMML
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer
from sklearn2pmml.pipeline import PMMLPipeline

## Input dataset and label

In [None]:
# User must provide filepath to dataset and label name
# Path of the CSV file that is read with pandas.read_csv when the data is loaded
DATASET_FILENAME = 'datasets/credit_card_transactions-ibm_v2.csv'
# Name of the column that is split off from the features and used as the label;
# it has to be one of the columns loaded from the dataset
DATASET_LABEL_NAME = 'Is Fraud?'

## Split features and labels from dataset

In [None]:
def split_features_and_lables(dataset_df, label):
    """Separate the label column from the feature columns.

    The input frame is left untouched; both results are independent copies.

    Args:
        dataset_df: DataFrame holding the feature columns and the label column.
        label: Name of the column to use as the label.

    Returns:
        A ``(features, labels)`` tuple: the frame without the label column,
        and the label column as a Series.

    Raises:
        KeyError: If ``label`` is not a column of ``dataset_df``.
    """
    labels = dataset_df[label].copy()
    features = dataset_df.drop(columns=label)
    return features, labels

## Machine learning model training

### Train any model using cross validation

In [None]:
def training_pipeline(model_type, name, hyperparameters, X, y, cat_feats, num_feats):
    """Train, tune, evaluate and export one scikit-learn estimator as PMML.

    The data is split 70/30 into train and test sets. Numeric features are
    standardised and categorical features are ordinal-encoded (unseen
    categories map to -1) before they reach the estimator. The
    hyperparameters are selected with ``GridSearchCV`` and the best pipeline
    is written to ``models/<name>.xml``.

    Note: the metrics that are computed depend on the module-level flag
    ``classification_problem``, which must be defined before this function
    is called.

    Args:
        model_type: Unfitted estimator instance used as the ``classifier``
            step; it must expose ``feature_importances_`` after fitting.
        name: Model name, used in log output and as the PMML file name.
        hyperparameters: Parameter grid for ``GridSearchCV``; keys address
            the estimator with the ``classifier__`` prefix.
        X: Feature DataFrame.
        y: Labels aligned with ``X``.
        cat_feats: Names of the categorical feature columns.
        num_feats: Names of the numeric feature columns.

    Returns:
        ``(best_pipeline, precision, recall, f1, accuracy)``. For a
        regression problem ``accuracy`` holds the value of ``clf.score`` on
        the test set and the other three metrics are the string ``'N/A'``.
    """
    print('Splitting dataset for training and testing...\n')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    print('Training ' + name + '...\n')
    start_time = time.time()

    feature_transformers = []
    # Column names in the order the ColumnTransformer emits them: the output
    # of each transformer is concatenated in registration order, which is
    # generally NOT the column order of X.
    transformed_feature_names = []
    if len(num_feats) > 0:
        numeric_transformer = Pipeline(steps=[('normalizer', StandardScaler())])
        feature_transformers.append(('num', numeric_transformer, num_feats))
        transformed_feature_names.extend(num_feats)

    if len(cat_feats) > 0:
        categorical_transformer = Pipeline(steps=[('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])
        feature_transformers.append(('cat', categorical_transformer, cat_feats))
        transformed_feature_names.extend(cat_feats)

    preprocessor = ColumnTransformer(transformers=feature_transformers)

    ai_pipeline = PMMLPipeline([
        ("preprocessor", preprocessor),
        ("classifier", model_type)
    ])

    # train using cross validation
    clf = GridSearchCV(ai_pipeline, hyperparameters)
    clf.fit(X=X_train, y=y_train)

    print('\tOptimal hyperparameters:')
    print('\t' + str(clf.best_params_))

    # save best performing model
    best_pipeline = clf.best_estimator_

    print('\tFeature importances:')
    importances = best_pipeline.named_steps.classifier.feature_importances_
    # A dedicated loop variable is used so that `name` (the model name needed
    # for the export below) is not overwritten by a feature name.
    for feature_name, importance in zip(transformed_feature_names, importances):
        print(feature_name, "=", importance)

    end_time = time.time()
    total_time = (end_time - start_time) / 60

    print('\n\tTraining time (mins): \n\t' + str(total_time) + '\n')

    # evaluate model performance
    y_pred = best_pipeline.predict(X_test)

    if classification_problem:
        # this will work successfully for a classification problem
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
    else:
        # this is a regression problem
        # above scoring functions don't work for regression
        accuracy = clf.score(X_test, y_test)
        precision = 'N/A'
        recall = 'N/A'
        f1 = 'N/A'

    # Export pipeline in pmml format
    sklearn2pmml(best_pipeline, 'models/' + name + ".xml", with_repr=True)

    return best_pipeline, precision, recall, f1, accuracy

### Scikit-learn model training

In [None]:
# Scikit-learn - Random Forest (Classification)
def train_random_forest(X, y, cat_feats, num_feats):
    """Tune and train a ``RandomForestClassifier`` via ``training_pipeline``.

    Every grid entry holds a single candidate value, which keeps the search
    short; add more values to an entry to widen the search.

    Args:
        X: Feature DataFrame.
        y: Labels aligned with ``X``.
        cat_feats: Names of the categorical feature columns.
        num_feats: Names of the numeric feature columns.

    Returns:
        The ``(best_pipeline, precision, recall, f1, accuracy)`` tuple
        produced by ``training_pipeline``.
    """
    param_grid = {
        'classifier__n_estimators': [10],        # e.g. [5, 10, 100, 500]
        'classifier__random_state': [33],
        'classifier__criterion': ['entropy'],    # e.g. ['gini', 'entropy']
        'classifier__max_features': [None],      # e.g. [None, 'sqrt', 'log2']
        'classifier__max_depth': [None],         # e.g. [None, 1000, 100, 10, 5]
        'classifier__min_samples_split': [2],    # e.g. [2, 10, 100]
        'classifier__min_samples_leaf': [1],     # e.g. [1, 5, 10, 50]
        'classifier__max_leaf_nodes': [None],    # e.g. [None, 5, 10, 100]
        'classifier__bootstrap': [True],         # e.g. [True, False]
    }

    estimator = RandomForestClassifier()
    return training_pipeline(estimator, 'RandomForestClassifier', param_grid, X, y, cat_feats, num_feats)


# Scikit-learn - Gradient Boosting Tree (Classification)
def train_gradient_boosted_tree_sklearn(X, y, cat_feats, num_feats):
    """Tune and train a ``GradientBoostingClassifier`` via ``training_pipeline``.

    Only the number of estimators, the learning rate and the random state
    are part of the grid; the loss is not searched and stays at the
    estimator's default.

    Args:
        X: Feature DataFrame.
        y: Labels aligned with ``X``.
        cat_feats: Names of the categorical feature columns.
        num_feats: Names of the numeric feature columns.

    Returns:
        The ``(best_pipeline, precision, recall, f1, accuracy)`` tuple
        produced by ``training_pipeline``.
    """
    param_grid = {
        'classifier__n_estimators': [10],     # e.g. [5, 10, 50, 100, 500]
        'classifier__learning_rate': [0.9],   # e.g. [1.0, 0.9, 0.1, 0.0001]
        'classifier__random_state': [33],
    }

    estimator = GradientBoostingClassifier()
    return training_pipeline(estimator, 'GradientBoostingClassifier', param_grid, X, y, cat_feats, num_feats)


# Scikit-learn - Random Forest (Regression)
def train_random_forest_regression(X, y, cat_feats, num_feats):
    """Tune and train a ``RandomForestRegressor`` via ``training_pipeline``.

    Every grid entry holds a single candidate value, which keeps the search
    short; add more values to an entry to widen the search.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        cat_feats: Names of the categorical feature columns.
        num_feats: Names of the numeric feature columns.

    Returns:
        The ``(best_pipeline, precision, recall, f1, accuracy)`` tuple
        produced by ``training_pipeline``.
    """
    param_grid = {
        'classifier__n_estimators': [10],        # e.g. [5, 10, 100, 500]
        'classifier__random_state': [33],
        'classifier__max_features': [None],      # e.g. [None, 'sqrt', 'log2']
        'classifier__max_depth': [None],         # e.g. [None, 1000, 100, 10, 5]
        'classifier__min_samples_split': [2],    # e.g. [2, 10, 100]
        'classifier__min_samples_leaf': [1],     # e.g. [1, 5, 10, 50]
        'classifier__max_leaf_nodes': [None],    # e.g. [None, 5, 10, 100]
        'classifier__bootstrap': [True],         # e.g. [True, False]
    }

    estimator = RandomForestRegressor()
    return training_pipeline(estimator, 'RandomForestRegressor', param_grid, X, y, cat_feats, num_feats)

# Scikit-learn - Gradient Boosting Tree (Regression)
def train_gradient_boosted_tree_sklearn_regression(X, y, cat_feats, num_feats):
    """Tune and train a ``GradientBoostingRegressor`` via ``training_pipeline``.

    Only the number of estimators, the learning rate and the random state
    are part of the grid; the loss is not searched and stays at the
    estimator's default.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        cat_feats: Names of the categorical feature columns.
        num_feats: Names of the numeric feature columns.

    Returns:
        The ``(best_pipeline, precision, recall, f1, accuracy)`` tuple
        produced by ``training_pipeline``.
    """
    param_grid = {
        'classifier__n_estimators': [10],     # e.g. [5, 10, 50, 100, 500]
        'classifier__learning_rate': [0.9],   # e.g. [1.0, 0.9, 0.1, 0.0001]
        'classifier__random_state': [33],
    }

    estimator = GradientBoostingRegressor()
    return training_pipeline(estimator, 'GradientBoostingRegressor', param_grid, X, y, cat_feats, num_feats)

### Train models with different frameworks

In [None]:
# Wrapper function to train any model type
def train_model(model_type, X, y, cat_feats, num_feats):
    """Train the model selected by ``model_type``.

    Args:
        model_type: One of ``'random_forest_sklearn'``,
            ``'gradient_boosted_tree_sklearn'``,
            ``'random_forest_sklearn_regression'`` or
            ``'gradient_boosted_tree_sklearn_regression'``.
        X: Feature DataFrame.
        y: Labels aligned with ``X``.
        cat_feats: Names of the categorical feature columns.
        num_feats: Names of the numeric feature columns.

    Returns:
        ``(pipeline, precision, recall, f1, accuracy)`` as returned by the
        selected training function. For an unrecognised ``model_type`` all
        five entries are ``None``.
    """
    if model_type == 'random_forest_sklearn':
        return train_random_forest(X, y, cat_feats, num_feats)
    if model_type == 'gradient_boosted_tree_sklearn':
        return train_gradient_boosted_tree_sklearn(X, y, cat_feats, num_feats)
    if model_type == 'random_forest_sklearn_regression':
        return train_random_forest_regression(X, y, cat_feats, num_feats)
    if model_type == 'gradient_boosted_tree_sklearn_regression':
        return train_gradient_boosted_tree_sklearn_regression(X, y, cat_feats, num_feats)

    # Unknown model type: report it and hand back a complete tuple of
    # placeholders so the caller can always unpack five values.
    print('Unknown model type: ' + str(model_type))
    return None, None, None, None, None

## Data cleaning

In [None]:
def _looks_numeric(value):
    """Return True if *value* is made of digits only once '.' and '-' are removed."""
    return str(value).replace('.', '').replace('-', '').isnumeric()


def _truncate_to_cobol_digits(dataset_df, num_feats, max_digits=18):
    """Scale down, in place, numeric columns holding values longer than *max_digits* digits."""
    cobol_max_number = (10 ** max_digits) - 1  # max number of 9's
    for feature in num_feats:
        column = dataset_df[feature]
        if not pd.api.types.is_numeric_dtype(column):
            continue

        # Look at both ends of the value range so that large negative values
        # are detected as well.
        largest = max(abs(column.max()), abs(column.min()))
        # Written as `not >` so that a NaN (column without values) is skipped.
        if not largest > cobol_max_number:
            continue

        # Downscale factor in powers of 10, derived from the exact digit count.
        # e.g. for features as 'Merchant Name' which are 19 digits, the
        # downscale_factor will be 10
        downscale_factor = 10 ** (len(str(int(largest))) - max_digits)
        if pd.api.types.is_integer_dtype(column):
            # Integer arithmetic keeps every remaining digit exact; a float
            # division would round away the low-order digits of such values.
            scaled = np.sign(column) * (column.abs() // downscale_factor)
        else:
            scaled = column / downscale_factor
        dataset_df[feature] = scaled.astype(np.int64)


def data_cleaning(dataset_df):
    """Clean a feature frame and classify its columns as categorical or numeric.

    The frame is modified in place and also returned. Steps:

    1. Remove the characters ``$ , : + # /`` from all text values.
    2. Fill missing values of the non-text columns with 0.
    3. For every text column, inspect one sample value (the second row, or
       the first row of a one-row frame). If it looks like a number the whole
       column is converted to float (sample contains '.') or integer and its
       missing values are filled with 0; otherwise, or if the conversion
       fails, the column is treated as categorical and missing values are
       replaced by ``'na'``.
    4. Cast the categorical columns to ``str``.
    5. Scale down numeric columns whose values exceed 18 digits, the maximum
       length of a COBOL numeric literal, so that the same data is used for
       training, testing and inferencing.

    Args:
        dataset_df: Feature DataFrame to clean.

    Returns:
        ``(dataset_df, cat_feats, num_feats)``: the cleaned frame, the list of
        categorical column names and the list of numeric column names.
    """
    print('Performing basic data cleaning...')

    # Data cleaning - remove unneeded chars
    print('Removing unneeded special characters...')
    cols = dataset_df.columns
    dataset_df[cols] = dataset_df[cols].replace(r'[$,:+#/]', '', regex=True)

    # Check for missing values
    print('Number of missing values:\n' + str(dataset_df.isna().sum()) + '\n')

    print('Replacing missing values and 2nd pass for num/cat determination...')

    categorical_features = list(dataset_df.select_dtypes(include="object").columns)

    cat_feats = []
    num_feats = list(dataset_df.select_dtypes(exclude='object').columns)
    if num_feats:
        dataset_df[num_feats] = dataset_df[num_feats].fillna(0)

    # Position of the row whose value decides how a text column is treated:
    # the second row when there is one, otherwise the first.
    sample_position = 1 if len(dataset_df) > 1 else 0

    for feature in categorical_features:
        column = dataset_df[feature]
        feature_value = column.iloc[sample_position] if len(column) > 0 else None

        if not _looks_numeric(feature_value):
            print(feature + ' type is STRING')
            # Replace any NaN's with 'na'
            dataset_df[feature] = column.fillna('na')
            cat_feats.append(feature)
            continue

        print(feature + ' type is NUMERIC')
        target_dtype = np.float64 if '.' in str(feature_value) else np.int64
        try:
            converted = column.astype(target_dtype)
        except (ValueError, TypeError, OverflowError):
            # At least one value of the column cannot be converted, so the
            # column is kept as a categorical feature.
            print(feature + ' gives an ERROR')
            dataset_df[feature] = column.fillna('na')
            cat_feats.append(feature)
        else:
            # Same missing-value policy as for the other numeric columns.
            dataset_df[feature] = converted.fillna(0)
            num_feats.append(feature)

    if cat_feats:
        dataset_df[cat_feats] = dataset_df[cat_feats].astype(str)

    # COBOL supports numeric literals up to 18 digits in length. Numeric
    # features that exceed 18 digits are scaled down here. Alternatively these
    # columns can be reclassified from numerical to categorical features.
    _truncate_to_cobol_digits(dataset_df, num_feats)

    print('Data cleaning successful!\n')

    return dataset_df, cat_feats, num_feats

## Fetch and process data

In [None]:
# Currently, datasets for the following AI on IBM Z Solution Templates are supported:
# - Fraud Detection (required_features_CCF)
# - Credit Risk Assessment (required_features_CRA)
# A third feature set, required_features_HIC, is accepted as well
# (NOTE(review): presumably a health insurance cost dataset - confirm).

# Verify that required features are in dataset
required_features = []
required_features_CCF = ['User', 'Card', 'Year', 'Month', 'Day', 'Time', 'Amount', 'Use Chip', 'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC', 'Errors',  'Is Fraud?']
required_features_CRA = ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_status', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
required_features_HIC = ['applicant_id', 'years_of_insurance_with_us', 'regular_checkup_last_year', 'adventure_sports', 'Occupation', 'visited_doctor_last_1_year', 'cholesterol_level', 'daily_avg_steps', 'age', 'heart_decs_history', 'Any_other_major_decs_history', 'Gender', 'avg_glucose_level', 'bmi', 'smoking_status', 'Year_last_admitted', 'Location', 'weight', 'covered_by_any_other_company', 'Alcohol', 'exercise', 'weight_change_in_last_one_year', 'fat_percentage', 'insurance_cost']

# Only the column names are needed for the check, so just two rows are read
dataset_df = pd.read_csv(DATASET_FILENAME, nrows=2)

# The first feature set that is completely contained in the file is selected
if set(required_features_CCF).issubset(dataset_df.columns):
    required_features = required_features_CCF
elif set(required_features_CRA).issubset(dataset_df.columns):
    required_features = required_features_CRA
elif set(required_features_HIC).issubset(dataset_df.columns):
    required_features = required_features_HIC

# Load dataset from file
if len(required_features) != 0:
    print('Loading dataset...')

    # In order to minimize training time, we will only use a subset of the provided data (keep every nth line)
    # File line 0 is the header; it is kept because 0 % n == 0
    n = 10
    dataset_df = pd.read_csv(DATASET_FILENAME, header=0, usecols=required_features, skiprows=lambda i: i % n != 0)

    # If you want to use the complete dataset, uncomment the follow line and comment the above code
    # dataset_df = pd.read_csv(DATASET_FILENAME, usecols=required_features)

    print('Dataset shape: ' + str(dataset_df.shape) + '\n')

    X, y = split_features_and_lables(dataset_df, DATASET_LABEL_NAME)

    X, cat_feats, num_feats = data_cleaning(X)
else:
    # No supported feature set matched. `required_features` is still empty
    # here, so the supported sets themselves are listed for the user.
    print('Please provide dataset with one of the following feature sets:')
    for supported_features in (required_features_CCF, required_features_CRA, required_features_HIC):
        print(supported_features)
    dataset_df = None
    X = None
    y = None
    cat_feats = None
    num_feats = None

print('AI Dataset (cleaned):')
X

## Train machine learning model

In [None]:
# Metrics of every trained model, one dict per model
model_details = []

# Determine if this is a classification or regression problem:
# a label with more than 10 distinct values is treated as a regression target
classification_problem = len(y.unique()) <= 10

if classification_problem:
    # Random forest classifier
    model_rf_sklearn, precision, recall, f1, accuracy = train_model('random_forest_sklearn', X, y, cat_feats, num_feats)
    model_details_rf_sklearn = dict(
        name='random_forest_sklearn',
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
    )
    model_details.append(model_details_rf_sklearn)

    # Gradient boosted tree classifier
    model_gb_sklearn, precision, recall, f1, accuracy = train_model('gradient_boosted_tree_sklearn', X, y, cat_feats, num_feats)
    model_details_gb_sklearn = dict(
        name='gradient_boosted_tree_sklearn',
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
    )
    model_details.append(model_details_gb_sklearn)

else:
    print('\t** this is a regression problem, using regression models instead **\n')

    # Random forest regressor
    model_rf_sklearn_rg, precision, recall, f1, accuracy = train_model('random_forest_sklearn_regression', X, y, cat_feats, num_feats)
    model_details_rf_sklearn_rg = dict(
        name='random_forest_sklearn_regression',
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
    )
    model_details.append(model_details_rf_sklearn_rg)

    # Gradient boosted tree regressor
    model_gb_sklearn_rg, precision, recall, f1, accuracy = train_model('gradient_boosted_tree_sklearn_regression', X, y, cat_feats, num_feats)
    model_details_gb_sklearn_rg = dict(
        name='gradient_boosted_tree_sklearn_regression',
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1,
    )
    model_details.append(model_details_gb_sklearn_rg)

## Recommend best ML model

In [None]:
# Rank the collected results by their 'accuracy' entry, best model first,
# without changing the order of model_details itself
model_details_sorted = list(model_details)
model_details_sorted.sort(key=lambda entry: entry['accuracy'], reverse=True)

print('AI model details (ranked by best performance)')
print(json.dumps(model_details_sorted, indent=4))