# Fiddler examples have moved! [Deprecation Notice]

Dear user, thank you for using Fiddler — we appreciate your time! The examples have moved to a new GitHub repository, linked below.


***
# [New fiddler-examples repo](https://github.com/fiddler-labs/fiddler-examples)
***

# Detecting drift in imbalanced scenarios

In this notebook we walk through a heavily imbalanced fraud prediction example. 
- We first train a model on the heavily imbalanced data with Random over-sampling
- We then introduce an uptick in fraud in production data. 
- Finally, we add the model to the Fiddler backend and use weighted aggregation to tease out the change in the production distribution.

## Package installation and imports

In [None]:
!pip3 install fiddler-client>=1.2.0
!pip install imblearn

In [None]:
import logging
import sys
import numpy as np
import pandas as pd
import time
from copy import deepcopy
from sklearn.ensemble import RandomForestClassifier
import sklearn.utils
import collections
import warnings
warnings.filterwarnings("ignore")


# ML Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import auc, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler


import fiddler as fdl
print(f"Running client version {fdl.__version__}")

In [None]:
# Seed reused as `random_state` by the split and sampling steps of this notebook.
RANDOM_STATE = 42

# Module-level logger; INFO and above are written to stdout.
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

## Preprocessing helper functions

The following cell defines a helper that preprocesses the raw data before training.

In [None]:
def preprocess_data(df):
    """Replace the raw ``Amount`` and ``Time`` columns with robust-scaled versions.

    The frame is modified in place: ``Amount`` and ``Time`` are dropped and the
    scaled values are inserted as the first two columns, ``scaled_amount`` and
    ``scaled_time``. All other columns keep their relative order.

    Args:
        df: DataFrame containing at least the columns ``Amount`` and ``Time``.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    # Scale from the raw columns before they are dropped. Each column gets its
    # own scaler fit; the (n, 1) result is flattened to a 1-D array for insert().
    scaled_amount = RobustScaler().fit_transform(df['Amount'].values.reshape(-1, 1)).ravel()
    scaled_time = RobustScaler().fit_transform(df['Time'].values.reshape(-1, 1)).ravel()

    df.drop(['Time', 'Amount'], axis=1, inplace=True)

    # Put the scaled columns directly at the front instead of appending them,
    # dropping them and re-inserting them.
    df.insert(0, 'scaled_amount', scaled_amount)
    df.insert(1, 'scaled_time', scaled_time)
    return df

# Dataset and Label Distribution

The dataset contains transactions made by credit cards in September 2013 by European cardholders.

This dataset presents transactions that occurred over two days, with 492 frauds out of 284,807 transactions. The dataset is highly imbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

Source: https://www.kaggle.com/mlg-ulb/creditcardfraud

In [None]:
# Location of the credit-card transaction CSV that the next cell reads with pandas.
DATA_URL = (
    'https://s3.us-west-1.amazonaws.com/fiddler.ai/download/'
    'creditcard_data.csv'
)

In [None]:
# Download the raw transactions and run them through the scaling helper.
df = preprocess_data(pd.read_csv(DATA_URL))

print('Class label distribbution (in percentage): ')
# Share of each label as a percentage of all rows; as the last expression of
# the cell its value is displayed as the cell output.
df['Class'].value_counts() / len(df) * 100

### Computing class-weights
- We will compute the class-weighting factors based on the distribution of labels in baseline data

In [None]:
# One 'balanced' weight per distinct label found in the `Class` column,
# converted to a plain Python list.
CLASS_WEIGHT = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df['Class']),
    y=df['Class'],
).tolist()
print(f'Computed class-weights: {CLASS_WEIGHT}')

## Random Undersampling helper functions


In [None]:
def under_sample(df, target_col, random_state=42):
    """Randomly under-sample every class down to the size of the rarest class.

    Args:
        df: DataFrame holding the data to balance.
        target_col: Name of the label column in ``df``.
        random_state: Seed used both for picking rows within each class and
            for the final shuffle, so results are reproducible.

    Returns:
        A new, shuffled DataFrame in which each label of ``target_col`` occurs
        as often as the least frequent label. Rows keep their original index
        labels. An empty input yields an empty copy.
    """
    class_counts = df[target_col].value_counts()
    if class_counts.empty:
        # Nothing to balance; pd.concat would raise on an empty list.
        return df.copy()

    min_samples = class_counts.min()
    # Draw the rows at random rather than slicing the first `min_samples` rows,
    # which would bias the sample towards whatever comes first in `df`.
    sampled_dfs = [
        df.loc[df[target_col] == label].sample(
            n=min_samples, random_state=random_state, replace=False
        )
        for label in class_counts.index
    ]
    balanced = pd.concat(sampled_dfs)
    # Shuffle so the classes are interleaved instead of stacked one after another.
    return balanced.sample(frac=1, random_state=random_state, replace=False)

def prepare_train_val(df, target_col, val_size=0.05):
    """Split ``df`` into train/validation features and labels.

    Args:
        df: DataFrame containing the feature columns and the label column.
        target_col: Name of the label column.
        val_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        The list returned by ``train_test_split(X, y, ...)`` (per scikit-learn's
        documentation: X_train, X_val, y_train, y_val), each with a fresh
        0..n-1 index.
    """
    # Build the inputs from the arguments; the previous version referred to
    # undefined names `X` and `y`.
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    data_split = train_test_split(X, y, test_size=val_size, random_state=RANDOM_STATE)
    return [part.reset_index(drop=True) for part in data_split]

def prepare_test(df, target_col):
    """Separate a frame into its feature columns and its label column.

    Returns a ``(features, labels)`` tuple: ``df`` without ``target_col`` and
    the ``target_col`` column itself.
    """
    labels = df[target_col]
    features = df.drop(columns=[target_col])
    return features, labels

## Model training helper functions

In [None]:
def train_model(model_obj, train_df, test_df, target_col='Class'):
    """Fit ``model_obj`` on ``train_df`` and print its PR-AUC on ``test_df``.

    Both frames are split into a feature array (every column except
    ``target_col``) and a label array. The ROC-AUC returned by ``get_metrics``
    is computed but not reported. Returns the fitted ``model_obj``.
    """
    def _to_arrays(frame):
        # (features, labels) as plain NumPy arrays
        return frame.drop(target_col, axis=1).values, frame[target_col].values

    X_train, y_train = _to_arrays(train_df)
    X_test, y_test = _to_arrays(test_df)

    print(f'Training the model with {len(X_train)} samples')
    model_obj.fit(X_train, y_train)

    _, calc_pr_auc = get_metrics(model_obj, X_test, y_test)
    print(
        f"Classsifier {model_obj.__class__.__name__} type has PR-AUC score of {calc_pr_auc} on test set"
    )
    return model_obj

def get_metrics(model_obj, X_test, y_test):
    """Compute ROC-AUC and PR-AUC of a fitted binary classifier.

    Args:
        model_obj: Fitted model exposing ``predict_proba``.
        X_test: Feature matrix to score.
        y_test: True binary labels for ``X_test``.

    Returns:
        Tuple ``(roc_auc, pr_auc)``.
    """
    # Column 1 of predict_proba holds the score of the positive class.
    y_scores = model_obj.predict_proba(X_test)[:, 1]
    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    # Both metrics are threshold-free, so they must be computed from the
    # continuous scores; feeding hard 0/1 predictions to roc_auc_score
    # collapses the ROC curve to a single operating point.
    return roc_auc_score(y_test, y_scores), auc(recall, precision)

def get_predictions(model_obj, X, y):
    """Return the features of ``X`` with the model score and true label appended.

    Args:
        model_obj: Fitted model exposing ``predict_proba``.
        X: Feature DataFrame to score. It is not modified.
        y: Labels matching the rows of ``X``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the columns of
        ``X`` followed by ``prediction_score`` (positive-class probability)
        and ``Class`` (the values of ``y``).
    """
    y_proba = pd.DataFrame.from_dict(
        {
            'prediction_score': model_obj.predict_proba(X)[:, 1],
            'Class': y,
        }
    ).reset_index(drop=True)
    # Work on a re-indexed copy so the caller's frame keeps its own index.
    features = X.reset_index(drop=True)
    return pd.concat([features, y_proba], axis=1)

# Train model with Random-Oversampling

In [None]:
t0 = time.time()
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

# Over-sample the training split only; the test split keeps the original label distribution.
ros = RandomOverSampler(random_state=RANDOM_STATE)
X_resampled, y_resampled = ros.fit_resample(train_df.drop(['Class'], axis=1), train_df['Class'])
train_df_oversampled = pd.concat([X_resampled, y_resampled], axis=1)

print(sorted(Counter(y_resampled).items()))

# Seed the forest so repeated runs of the notebook produce the same model.
model_obj = RandomForestClassifier(random_state=RANDOM_STATE)
print('Traning the model. This might take a couple of minutes...')
model_obj = train_model(model_obj, train_df_oversampled, test_df, target_col='Class')

# get predictions from the trained model
# The baseline is scored on the original (not over-sampled) training split.
X_baseline, y_baseline = prepare_test(train_df, target_col='Class')
baseline_pred = get_predictions(model_obj, X_baseline, y_baseline)

X_test, y_test = prepare_test(test_df, target_col='Class')
test_pred = get_predictions(model_obj, X_test, y_test)
t1 = time.time()
print(f'Time required to train the model: {(t1-t0)/60.} mins')

# Simulating increase in fraud

- To simulate an increase in fraud, we will oversample the minority class (the fraud label) by a factor of 3.

In [None]:
def simulate_fraud_uptick(
    df,
    target_col='Class',
    prediction_col='prediction_score',
    ratio=3.0,
):
    """Return ``df`` with its label-1 rows resampled by ``ratio``.

    Rows whose ``target_col`` equals 1 are sampled with ``frac=ratio`` (with
    replacement when ``ratio`` exceeds 1) and placed ahead of the unchanged
    label-0 rows. Rows with any other label are not part of the result.
    ``prediction_col`` is accepted but not used.
    """
    is_fraud = df[target_col] == 1
    is_legit = df[target_col] == 0

    # Sampling more rows than exist is only possible with replacement.
    with_replacement = ratio > 1
    boosted_fraud = df.loc[is_fraud].sample(
        frac=ratio,
        replace=with_replacement,
        random_state=RANDOM_STATE,
    )
    return pd.concat([boosted_fraud, df.loc[is_legit]])
# Build the simulated production data from the scored test split, spelling out
# the defaults that the helper would apply anyway.
uptick_fraud_df = simulate_fraud_uptick(test_pred, target_col='Class', ratio=3.0)
# Label counts after the uptick (displayed as the cell output).
uptick_fraud_df['Class'].value_counts()

# Upload Baseline and Add Models

## Connect to Fiddler

- Please change the following connection parameters to match the Fiddler server in use.

In [None]:
import os

# Connection parameters
# Read from the environment so that no credential is stored in the notebook.
# NOTE(review): the token previously committed here should be treated as
# leaked and revoked.
org_id = os.environ.get('FIDDLER_ORG_ID', 'imbalbash')
url = os.environ.get('FIDDLER_URL', 'https://imbalbash.dev.fiddler.ai/')
auth_token = os.environ.get('FIDDLER_AUTH_TOKEN')
if not auth_token:
    raise RuntimeError(
        'Set the FIDDLER_AUTH_TOKEN environment variable to your Fiddler API token.'
    )
client = fdl.FiddlerApi(url, org_id, auth_token, version=2)

### Set project, dataset and model ids
- Let's set-up project, model and dataset names

In [None]:
# Column names and decision threshold handed to the model registration.
TARGET_COL = 'Class'
OUTPUT_COL = 'prediction_score'
BINARY_THRESHOLD = 0.4

# Identifiers of the Fiddler project, dataset and model used in the cells below.
project_name = 'imbalanced_drift'
dataset_id = 'fraud_dataset'
model_id = 'oversampled_model'

In [None]:
# Flip to True to delete the model, then the dataset, then the project.
clean_up_entities = False
if clean_up_entities:
    logger.info(f'Deleting model: {model_id} from project: {project_name}')
    client.delete_model(project_name, model_id, delete_prod=True)

    logger.info(f'Deleting dataset: {dataset_id} from project: {project_name}')
    client.delete_dataset(project_name, dataset_id)

    logger.info(f'Deleting project: {project_name}')
    client.delete_project(project_name)

### Upload baseline dataset

In [None]:
# Keep a seeded 10% sample of the baseline predictions and of the simulated
# production data.
baseline_df = baseline_pred.sample(random_state=RANDOM_STATE, frac=0.1)
event_df = uptick_fraud_df.sample(random_state=RANDOM_STATE, frac=0.1)
print(
    f'Adding model with {len(baseline_df)} samples in baseline '
    f'and {len(event_df)} samples in production'
)

In [None]:
# Create the project unless the server already lists it.
if project_name in client.list_projects():
    logger.info(f'Project: {project_name} already exists')
else:
    logger.info(f'Creating project: {project_name}')
    client.create_project(project_name)

# The schema is inferred from the full set of baseline predictions
# (`baseline_pred`); the rows uploaded below are the `baseline_df` sample.
baseline_dataset_info = fdl.DatasetInfo.from_dataframe(
    baseline_pred,
    dataset_id=dataset_id,
    max_inferred_cardinality=100
)

# Upload the baseline sample unless a dataset with this id already exists.
if dataset_id in client.list_datasets(project_id=project_name):
    logger.info(f'Dataset: {dataset_id} already exists in Project: {project_name}')
else:
    logger.info(f'Upload dataset {dataset_id}')
    resp = client.upload_dataset(
        project_id=project_name,
        dataset_id=dataset_id,
        dataset={'baseline': baseline_df},
        info=baseline_dataset_info,
    )
# Last expression of the cell: display the inferred dataset schema.
baseline_dataset_info

### Add Model
- We will add 2 models, with and without class-weighting parameters. 

In [None]:
# Settings shared by both registrations; computed once, outside the loop.
target_col = TARGET_COL
output_col = OUTPUT_COL
# Ordered list (not a set) so the feature order follows the dataframe columns.
inp_features = [
    col for col in baseline_df.columns if col not in (target_col, output_col)
]

# Register the same model twice: once with class weighting and once without,
# then publish the same production events to each.
for mid in [model_id + '_weighted', model_id]:
    if 'weighted' in mid:
        weighting_params = fdl.WeightingParams(class_weight=CLASS_WEIGHT)
        logger.info('Adding model with weighting parameters.')
    else:
        weighting_params = None
        logger.info('Adding model without weighting parameters.')

    model_info = fdl.ModelInfo.from_dataset_info(
        dataset_info=baseline_dataset_info,
        target=target_col,
        dataset_id=dataset_id,
        features=inp_features,
        display_name='Fraud model',
        description='Fraud model with predictions in baseline',
        input_type=fdl.core_objects.ModelInputType.TABULAR,
        model_task=fdl.core_objects.ModelTask.BINARY_CLASSIFICATION,
        outputs=output_col,
        weighting_params=weighting_params,
        binary_classification_threshold=BINARY_THRESHOLD,
        categorical_target_class_details=[0, 1],
    )

    # Check for the id being registered in this iteration (`mid`), not the
    # base `model_id`; otherwise the weighted variant is skipped or re-added
    # depending on whether the unweighted one exists.
    if mid not in client.list_models(project_id=project_name):
        client.add_model(project_id=project_name, model_id=mid, dataset_id=dataset_id, model_info=model_info)
    else:
        logger.info(f'Model: {mid} already exists in Project: {project_name}')

    t0 = time.time()
    logger.info('Publishing events...')
    client.publish_events_batch(
        project_id=project_name,
        model_id=mid,
        batch_source=event_df,
    )
    t1 = time.time()
    dt = t1 - t0
    # Report the number of events actually published (`event_df`), not the
    # size of the full frame it was sampled from.
    n_events = len(event_df)
    print(f'Time required: {dt} secs for {n_events} events. [{n_events/dt} events/sec]')

In [None]:
# Keep the part of the client URL before the first '/v2' and point it at the project page.
landing_url = f"{client.v2.client_v2.url.partition('/v2')[0]}/projects/{project_name}"
print(f'You can now navigate to {landing_url} to compare drift calculated by weighted and non-weighted models.')
print('You will notice that the class-weighted drift chart is able to surface the uptick in fraud.')