# Experiments with Berka Dataset (Classification Tasks)

Supervised setting: choose stable series and label existing points as recurrent (label 1) then add noise or overlap two different series and see how well each method performs

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Choose between one of these
BASE_DATA_PATH = '../datasets/trans.asc'    # BERKA
BASE_DATA_PATH = '../datasets/automated_transfers_bank.pqt'  # BANK_AUTOMATED

In [3]:
import dbscanmethod
import graphsmethod as graphmethod
import matrixmethod
from yousi import DetectRecurrencyII

import noiser
import swifter

import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import (
    classification_report, accuracy_score, f1_score, precision_score, recall_score,
    recall_score, roc_curve, RocCurveDisplay, auc
)

import mercury.viz
plt.style.use('mercury')

## Read Berka Dataset and keep only stable series

In [4]:
min_length = 4 # minimum length of series
min_std = 5 # minimum std to assure stable series
N_LAST_POINTS = 20   

var_pk = 'payment_channel'
var_date = 'date'
var_amnt = 'amount'

In [5]:
%%capture
if 'trans.asc' in BASE_DATA_PATH:
    df = pd.read_table(BASE_DATA_PATH, sep=';', low_memory=False)
    df = df.loc[~df.account.isna()] 
    
    df.loc[:, 'account'] = df.loc[:, 'account'].astype(int)
    
    df['payment_channel'] = df.account_id.astype(str) + '-' + df.account.astype(str) +'-' + df.operation.astype(str) + '-' + df.type.astype(str)
    df['date'] =pd.to_datetime(df['date'].astype(str), format='%y%m%d')

if 'automated' in BASE_DATA_PATH:
    df = pd.read_parquet('../datasets/bank_automated_transfers.pqt')
    df['date'] =pd.to_datetime(df['date'].astype(str))#, format='%y%m%d')
    df['type'] = 1  # Needed for YOUSFI method.
    
df[var_amnt] = df[var_amnt].abs()

df = df.loc[:, [var_pk, var_date, var_amnt, 'type']]

In [6]:
print(f'Raw datset: there are {df.shape[0]} transactions and a total of {df.payment_channel.nunique()} unique payment channels')

Raw datset: there are 476882 transactions and a total of 47677 unique payment channels


In [7]:
print(f"Taking last {N_LAST_POINTS} of each payment_channel...")
df = df.groupby('payment_channel').tail(N_LAST_POINTS)#(100)

Taking last 20 of each payment_channel...


In [8]:
# We assume that very "amount-flat" series are stable
stds = df.groupby(var_pk)[var_amnt].std().sort_values(ascending=False).dropna()
stable_payment_channels = set(stds[stds < min_std].index.tolist())
df_stable = df[df[var_pk].isin(stable_payment_channels)]

print(f'Stable series: {df_stable[var_pk].nunique()}, this represents a {round(df_stable[var_pk].nunique()/df[var_pk].nunique()*100,2)}% of total no. of series')
print(f'Stable total transactions: {len(df_stable)}, this represents a {round(len(df_stable)/len(df)*100,2)}% of total no. of transactions')

Stable series: 46077, this represents a 96.64% of total no. of series
Stable total transactions: 458148, this represents a 96.47% of total no. of transactions


In [9]:
df = df_stable.copy()
lelele = df.sort_values([var_pk, var_date])\
    .groupby(var_pk)[var_date]\
    .diff(1).dt.days


tdf = df.join(lelele, rsuffix='_diffdays')
posid = tdf.groupby(['payment_channel']).cumcount().rename('group_position')
tdf = tdf.join(posid)

lelele = tdf.sort_values(['payment_channel',  'date'])\
.groupby(['payment_channel'])\
.apply(noiser.detect_breaks_wrapped)

lelele = lelele.explode(['is_break', 'group_position'])

have_no_breaks = lelele.groupby('payment_channel')['is_break'].max()
have_no_breaks = set(have_no_breaks[have_no_breaks == False].index.tolist())
df = df.loc[df.payment_channel.isin(have_no_breaks)]
print(f"After removing series with at least a break we lose {df_stable.payment_channel.nunique() - df.payment_channel.nunique()} series")

After removing series with at least a break we lose 0 series


In [10]:
# Remove payment channels with too few transactions
cnts = df.groupby(var_pk)[var_pk].count().sort_values(ascending=True)
df_filter = df.loc[df[var_pk].isin(cnts[cnts > min_length].index)]
print(f'Short series removed = {df.payment_channel.nunique()- df_filter[var_pk].nunique()}, {round((df[var_pk].nunique()- df_filter[var_pk].nunique())/df[var_pk].nunique()*100,2)}% ')

Short series removed = 6161, 13.37% 


In [11]:
df = df_filter.copy()
df = df.sort_values(['payment_channel','date'])

In [12]:
print("Computing diffdays...")
# compute diffdays
lelele = df.sort_values(['payment_channel', 'date'])\
    .groupby('payment_channel')['date']\
    .diff(1).dt.days\
    .fillna(0)\
    .abs()

df = df.join(lelele.rename('datediff'))

df = df.sort_values(['payment_channel', 'date']).reset_index(drop=True)
posid = df.groupby(['payment_channel']).cumcount().rename('group_position')
df = df.join(posid)

stds = df.groupby(['payment_channel'])['amount'].std()

stable_channels = df.loc[df.payment_channel.isin(set(stds[stds < 5].index.tolist()))]


stable_channels = stable_channels.drop_duplicates(['payment_channel', 'amount', 'date'])  # we wont consider series with several identical payments on the same day

Computing diffdays...


In [13]:
df_tmp = stable_channels.copy()
df_tmp.drop_duplicates(subset=['payment_channel', 'date', 'amount'], keep='first', inplace=True, ignore_index=True)
print(f'check there are no duplicates == {df_tmp.shape[0] == df.shape[0]}: series with same date and amount for the same payment channel')

check there are no duplicates == True: series with same date and amount for the same payment channel


In [14]:
stable_channels['day'] = stable_channels['date'].apply(lambda r:r.day)
df_stats = stable_channels.groupby([var_pk])['day'].nunique()
df_stats.describe()

count    39916.000000
mean         3.503883
std          1.074926
min          1.000000
25%          3.000000
50%          3.000000
75%          4.000000
max         20.000000
Name: day, dtype: float64

## Experiment 1: Add new noise to stable series (setting 0.05)

In [15]:
prob_perturbation = .05  # Probability to add a new point after each point of a serie
n_desv_outlier = 3       # multiplier std 

In [16]:
df = stable_channels.copy()
#df = stable_channels_tmp.copy()
df.shape

(436615, 7)

In [18]:
%%capture
noised_df = noiser.add_noise_update_exp1(
    dataframe=df,
    col_pk=var_pk,
    col_amnt=var_amnt,
    col_date=var_date,
    n_desv_outlier=n_desv_outlier,
    prob_perturbation=prob_perturbation
)

noised_df = noised_df.copy().drop(['datediff', 'group_position'], axis=1)

print("Computing diffdays...")
# compute diffdays
lelele = noised_df.sort_values(['payment_channel', 'date'])\
    .groupby('payment_channel')['date']\
    .diff(1).dt.days\
    .fillna(0)\
    .abs()

noised_df = pd.merge(noised_df, lelele.rename('datediff'), left_index=True, right_index=True, how='left')

noised_df = noised_df.sort_values(['payment_channel', 'date']).reset_index(drop=True)
posid = noised_df.groupby(['payment_channel']).cumcount().rename('group_position')
noised_df = noised_df.join(posid)

#dup_cols = ['trans_id', 'date', 'amount'] if 'trans_id' in noised_df.columns else ['date', 'amount']
#noised_df.drop_duplicates(subset=dup_cols, keep='first', inplace=True, ignore_index=True)


In [19]:
assert len(noised_df) > len(df), f"Noised DF should be larger since we added points!  ({len(noised_df)} vs {len(df)})"

### Assesment Exp 1

#### DBSCAN

In [20]:
df = noised_df.copy()

# Save original label
df['is_rec_orii'] =  df.is_rec

df.is_rec.value_counts() / len(df)

1    0.878488
0    0.121512
Name: is_rec, dtype: float64

In [21]:
%%time
df = dbscanmethod.main_dbscan_method(df)
df['cluster_id_dbscan'] = df['is_rec']
df['is_rec'] = df['is_rec_orii']
#df = df.drop('is_rec_ori', axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67657/67657 [02:00<00:00, 561.82it/s]


CPU times: user 2min 2s, sys: 4.62 s, total: 2min 7s
Wall time: 2min 8s


#### Matrix 

In [22]:
%%time
def matrix_flag(data, use_dbscan=False):
    dates = data[var_date].values
    diff_days = data[var_date].diff(1).dt.days.dropna().values
    amounts = data[var_amnt].values
    orders = data.group_position.values

    subseries = matrixmethod.main_matrix_method(diff_days, amounts, use_dbscan=use_dbscan)
    
    flags = np.ones(len(data)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return pd.Series({'cluster_id_matrix_binning': flags, 'group_position': orders})


bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_flag, use_dbscan=False)\
    .explode(['cluster_id_matrix_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])

df.loc[df.cluster_id_matrix_binning >= 0, 'cluster_id_matrix_binning'] = 1
df.loc[df.cluster_id_matrix_binning < 0, 'cluster_id_matrix_binning'] = 0

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39916/39916 [00:53<00:00, 746.12it/s]


CPU times: user 51.5 s, sys: 932 ms, total: 52.4 s
Wall time: 53.9 s


#### Graphs

In [23]:
%%time
def flag_matrix(dates, amounts):    
    dates = np.array(dates).astype(np.datetime64)
    amounts = np.array(amounts)
    
    datediffs_to_previous = np.diff(dates).astype('timedelta64[D]')
    datediffs_to_previous = datediffs_to_previous/ np.timedelta64(1, 'D')
    subseries = graphmethod.main_matrix_method_graphs(datediffs_to_previous, amounts, use_dbscan=False)

    flags = np.ones(len(dates)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return flags
    
def matrix_udf_graphs(data):
    dates = data.date.values
    amounts = data.amount.values
    orders = data.group_position.values
    
    subseries_ids = flag_matrix(dates, amounts)
    
    return pd.Series({'cluster_id_matrix_graph_binning': subseries_ids, 'group_position': orders})

bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_udf_graphs)\
    .explode(['cluster_id_matrix_graph_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])
df.loc[df.cluster_id_matrix_graph_binning >= 0, 'cluster_id_matrix_graph_binning'] = 1
df.loc[df.cluster_id_matrix_graph_binning < 0, 'cluster_id_matrix_graph_binning'] = 0


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39916/39916 [01:29<00:00, 446.04it/s]


CPU times: user 1min 27s, sys: 1.1 s, total: 1min 29s
Wall time: 1min 29s


#### Yousfi (baseline)

In [24]:
%%capture
class Config:
    def __init__(self):
        self.type_col='type'
        self.client_col= 'payment_channel'
        self.customer_id= 'payment_channel'
        self.time_col=  'date'
        self.amount_col='amount'
        self.trans_amount='amount'
        self.trans_date=  'date'
        self.trans_type=  'type'


config = Config()

def get_fn(df):
    dfs =  list((DetectRecurrencyII(
                  trans_data = df,
                  client_col= 'payment_channel',
                  time_col=  'date',
                  amount_col='amount',
                  config=config
                  )
           )[1].values())

    #import pdb; pdb.set_trace()
    # Add a cluster_id col for all dfs
    try:
        dfs = [dfs[i].assign(cluster_id = i).reset_index()  for i, d in enumerate(dfs)]
    except:
        import pdb; pdb.set_trace()

    if len(dfs) > 0:
        concat_df = pd.concat(dfs)


        out = pd.merge(df, 
              concat_df, 
             left_on=['date', 'amount'], 
             right_on=['date', 'amount'], how='left')

        dfs = out
    else:
        dfs = df.assign(cluster_id = -1)

    return dfs.loc[:, ['date', 'amount', 'cluster_id']].reset_index(drop=True)    


dft = df.copy()

outt = dft.sort_values(['payment_channel', 'date']).groupby('payment_channel').apply(lambda x: get_fn(x))
dft = pd.merge(dft, outt.reset_index().drop('level_1', axis=1), on=('payment_channel', 'date', 'amount'))
dft['cluster_id'] = dft['cluster_id'].fillna(-1).astype(int)

df['cluster_id_yousfi'] = dft.cluster_id

df.loc[df['cluster_id_yousfi'] >= 0, 'cluster_id_yousfi'] = 1
df.loc[df['cluster_id_yousfi'] < 0, 'cluster_id_yousfi'] = 0

#### Metrics

In [25]:
for method in ('cluster_id_matrix_graph_binning', 'cluster_id_matrix_binning', 'cluster_id_dbscan', 'cluster_id_yousfi'):
    print(f"REPORT FOR METHOD: {method} -----------------")
    print(classification_report(df.is_rec.values, df[method].values.astype(int)))

    print()

REPORT FOR METHOD: cluster_id_matrix_graph_binning -----------------
              precision    recall  f1-score   support

           0       0.74      0.96      0.84     63266
           1       0.99      0.95      0.97    457392

    accuracy                           0.96    520658
   macro avg       0.87      0.96      0.91    520658
weighted avg       0.96      0.96      0.96    520658


REPORT FOR METHOD: cluster_id_matrix_binning -----------------
              precision    recall  f1-score   support

           0       0.75      0.99      0.86     63266
           1       1.00      0.96      0.98    457392

    accuracy                           0.96    520658
   macro avg       0.88      0.97      0.92    520658
weighted avg       0.97      0.96      0.96    520658


REPORT FOR METHOD: cluster_id_dbscan -----------------
              precision    recall  f1-score   support

           0       0.70      0.79      0.74     63266
           1       0.97      0.95      0.96    4

#### Visualizations

In [26]:
#get list of all PK
#pks = df.groupby('payment_channel')['is_rec'].sum().sort_values(ascending=False)
#pks = pks.index.values.tolist() 
#print (len(pks))

In [27]:
#plot some samples
#for i in pks[0:5]:
#    t = df.loc[(df.payment_channel == i)].copy()
#    t[t.amount > 0]
#    #t.amount_lvl =  t.amount_lvl.astype(str)
#
#    print('...............................')
#    print(f' primary key: {i}')
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_yousfi'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_graph_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_dbscan'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()

## Experiment 1: Add new noise to stable series (setting 0.25)

We can repeat the first experiment increasing the number of noise to be added

In [28]:
prob_perturbation = .25  # Probability to add a new point after each point of a serie
n_desv_outlier = 3       # multiplier std 

In [29]:
df = stable_channels.copy()
#df = stable_channels_tmp.copy()
df.shape

(436615, 7)

In [30]:
%%capture
noised_df = noiser.add_noise_update_exp1(
    dataframe=df,
    col_pk=var_pk,
    col_amnt=var_amnt,
    col_date=var_date,
    n_desv_outlier=n_desv_outlier,
    prob_perturbation=prob_perturbation
)

noised_df = noised_df.copy().drop(['datediff', 'group_position'], axis=1)
print(noised_df.shape)

print("Computing diffdays...")
# compute diffdays
lelele = noised_df.sort_values(['payment_channel', 'date'])\
    .groupby('payment_channel')['date']\
    .diff(1).dt.days\
    .fillna(0)\
    .abs()


noised_df = pd.merge(noised_df, lelele.rename('datediff'), left_index=True, right_index=True, how='left')

noised_df = noised_df.sort_values(['payment_channel', 'date']).reset_index(drop=True)
posid = noised_df.groupby(['payment_channel']).cumcount().rename('group_position')
noised_df = noised_df.join(posid)

#dup_cols = ['trans_id', 'date', 'amount'] if 'trans_id' in noised_df.columns else ['date', 'amount']
#noised_df.drop_duplicates(subset=dup_cols, keep='first', inplace=True, ignore_index=True)

print (noised_df.shape)

In [31]:
assert len(noised_df) > len(df), "Noised DF should be larger since we added points!"

### Assesment Exp 1

#### DBSCAN

In [32]:
df = noised_df.copy()

# Save original label
df['is_rec_orii'] =  df.is_rec

df.is_rec.value_counts() / len(df)

1    0.654989
0    0.345011
Name: is_rec, dtype: float64

In [33]:
%%time
df = dbscanmethod.main_dbscan_method(df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100218/100218 [02:57<00:00, 563.69it/s]


CPU times: user 2min 59s, sys: 6.47 s, total: 3min 5s
Wall time: 3min 6s


In [34]:
df['cluster_id_dbscan'] = df['is_rec']
df['is_rec'] = df['is_rec_orii']
#df = df.drop('is_rec_ori', axis=1)

#### Matrix 

In [35]:
%%time
def matrix_flag(data, use_dbscan=False):
    dates = data[var_date].values
    diff_days = data[var_date].diff(1).dt.days.dropna().values
    amounts = data[var_amnt].values
    orders = data.group_position.values

    subseries = matrixmethod.main_matrix_method(diff_days, amounts, use_dbscan=use_dbscan)
    
    flags = np.ones(len(data)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return pd.Series({'cluster_id_matrix_binning': flags, 'group_position': orders})


bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_flag, use_dbscan=False)\
    .explode(['cluster_id_matrix_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])

df.loc[df.cluster_id_matrix_binning >= 0, 'cluster_id_matrix_binning'] = 1
df.loc[df.cluster_id_matrix_binning < 0, 'cluster_id_matrix_binning'] = 0

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39916/39916 [01:40<00:00, 397.51it/s]


CPU times: user 1min 36s, sys: 1.35 s, total: 1min 38s
Wall time: 1min 41s


#### Graphs

In [36]:
%%time
def flag_matrix(dates, amounts):    
    dates = np.array(dates).astype(np.datetime64)
    amounts = np.array(amounts)
    
    datediffs_to_previous = np.diff(dates).astype('timedelta64[D]')
    datediffs_to_previous = datediffs_to_previous/ np.timedelta64(1, 'D')
    subseries = graphmethod.main_matrix_method_graphs(datediffs_to_previous, amounts, use_dbscan=False)

    flags = np.ones(len(dates)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return flags
    
def matrix_udf_graphs(data):
    dates = data.date.values
    amounts = data.amount.values
    orders = data.group_position.values
    
    subseries_ids = flag_matrix(dates, amounts)
    
    return pd.Series({'cluster_id_matrix_graph_binning': subseries_ids, 'group_position': orders})

bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_udf_graphs)\
    .explode(['cluster_id_matrix_graph_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])
df.loc[df.cluster_id_matrix_graph_binning >= 0, 'cluster_id_matrix_graph_binning'] = 1
df.loc[df.cluster_id_matrix_graph_binning < 0, 'cluster_id_matrix_graph_binning'] = 0

print(classification_report(df.is_rec.values, df.cluster_id_matrix_graph_binning.values.astype(int)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39916/39916 [06:29<00:00, 102.52it/s]


              precision    recall  f1-score   support

           0       0.72      0.63      0.67    273213
           1       0.82      0.87      0.84    518684

    accuracy                           0.79    791897
   macro avg       0.77      0.75      0.76    791897
weighted avg       0.78      0.79      0.78    791897

CPU times: user 6min 16s, sys: 2.92 s, total: 6min 19s
Wall time: 6min 30s


#### Yousfi (baseline)

In [37]:
%%capture
class Config:
    def __init__(self):
        self.type_col='type'
        self.client_col= 'payment_channel'
        self.customer_id= 'payment_channel'
        self.time_col=  'date'
        self.amount_col='amount'
        self.trans_amount='amount'
        self.trans_date=  'date'
        self.trans_type=  'type'


config = Config()

def get_fn(df):
    dfs =  list((DetectRecurrencyII(
                  trans_data = df,
                  client_col= 'payment_channel',
                  time_col=  'date',
                  amount_col='amount',
                  config=config
                  )
           )[1].values())

    #import pdb; pdb.set_trace()
    # Add a cluster_id col for all dfs
    try:
        dfs = [dfs[i].assign(cluster_id = i).reset_index()  for i, d in enumerate(dfs)]
    except:
        import pdb; pdb.set_trace()

    if len(dfs) > 0:
        concat_df = pd.concat(dfs)


        out = pd.merge(df, 
              concat_df, 
             left_on=['date', 'amount'], 
             right_on=['date', 'amount'], how='left')

        dfs = out
    else:
        dfs = df.assign(cluster_id = -1)

    return dfs.loc[:, ['date', 'amount', 'cluster_id']].reset_index(drop=True)    


dft = df.copy()

outt = dft.sort_values(['payment_channel', 'date']).groupby('payment_channel').apply(lambda x: get_fn(x))
dft = pd.merge(dft, outt.reset_index().drop('level_1', axis=1), on=('payment_channel', 'date', 'amount'))
dft['cluster_id'] = dft['cluster_id'].fillna(-1).astype(int)

df['cluster_id_yousfi'] = dft.cluster_id

df.loc[df['cluster_id_yousfi'] >= 0, 'cluster_id_yousfi'] = 1
df.loc[df['cluster_id_yousfi'] < 0, 'cluster_id_yousfi'] = 0



#### Metrics

In [38]:
for method in ('cluster_id_matrix_graph_binning', 'cluster_id_matrix_binning', 'cluster_id_dbscan', 'cluster_id_yousfi'):
    print(f"REPORT FOR METHOD: {method} -----------------")
    print(classification_report(df.is_rec.values, df[method].values.astype(int)))

    print()

REPORT FOR METHOD: cluster_id_matrix_graph_binning -----------------
              precision    recall  f1-score   support

           0       0.72      0.63      0.67    273213
           1       0.82      0.87      0.84    518684

    accuracy                           0.79    791897
   macro avg       0.77      0.75      0.76    791897
weighted avg       0.78      0.79      0.78    791897


REPORT FOR METHOD: cluster_id_matrix_binning -----------------
              precision    recall  f1-score   support

           0       0.76      0.88      0.82    273213
           1       0.93      0.85      0.89    518684

    accuracy                           0.86    791897
   macro avg       0.85      0.87      0.85    791897
weighted avg       0.87      0.86      0.87    791897


REPORT FOR METHOD: cluster_id_dbscan -----------------
              precision    recall  f1-score   support

           0       0.66      0.58      0.61    273213
           1       0.79      0.84      0.82    5

#### Visualizations

In [39]:
#get list of PK
#pks = df.groupby('payment_channel')['is_rec'].sum().sort_values(ascending=False)
#pks = pks.index.values.tolist() 

In [40]:
#plot some samples
#for i in pks[0:15]:
#    t = df.loc[(df.payment_channel == i)].copy()
#    t[t.amount > 0]
#    #t.amount_lvl =  t.amount_lvl.astype(str)
#
#    print('...............................')
#    print(f' primary key: {i}')
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_yousfi'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_graph_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_dbscan'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()

## Experiment 2: Merging stable series

In [41]:
df = stable_channels.copy()#.drop('group_position', axis=1)
df = df.drop(['group_position','datediff', 'day'], axis=1)

In [42]:
%%capture
noised_df = noiser.add_noise_update_exp2(
                            dataframe=df.copy(),
                            col_pk=var_pk,
                            col_amnt=var_amnt,
                            col_date=var_date,
                            prob_combination=1,
                            noise_type = 'combine')

print("Computing diffdays...")
# compute diffdays
lelele = noised_df.sort_values(['payment_channel', 'date'])\
    .groupby('payment_channel')['date']\
    .diff(1).dt.days\
    .fillna(0)\
    .abs()

#noised_df = noised_df.join(lelele.rename('datediff'))
noised_df = pd.merge(noised_df, lelele.rename('datediff'), left_index=True, right_index=True, how='left')
#dup_cols = ['trans_id', 'date', 'amount'] if 'trans_id' in noised_df.columns else ['date', 'amount']
#noised_df.drop_duplicates(subset=dup_cols, keep='first', inplace=True, ignore_index=True)


noised_df = noised_df.sort_values(['payment_channel', 'date']).reset_index(drop=True)
posid = noised_df.groupby(['payment_channel']).cumcount().rename('group_position')
noised_df = noised_df.drop('group_position', axis=1).join(posid)

n_orig = noised_df.groupby('payment_channel')['payment_channel_ori'].nunique().sort_values()
noised_df = noised_df.loc[noised_df.payment_channel.isin(set(n_orig[n_orig > 1].index.tolist()))]


# Execute only when combining series
assert np.isclose(df.payment_channel.nunique() / 2, noised_df.payment_channel.nunique(), atol=2), "This should be half the first"

### Assesment Exp 2

In [43]:
df = noised_df.copy()

# Save original label
df['is_rec_orii'] =  df.is_rec

df.is_rec.value_counts() / len(df)

1    1.0
Name: is_rec, dtype: float64

#### DBSCAN

In [44]:
df['is_rec_ori'] = df['is_rec']

In [45]:
df = dbscanmethod.main_dbscan_method(df)

df['cluster_id_dbscan'] = df['amount_lvl']*10 + df['day_cluster']  
df['cluster_id_dbscan'] = df.apply(lambda x: -1 if x['is_rec']==0 else x['cluster_id_dbscan'], axis=1)
#df = df.drop(['day_cluster','amount_lvl'], axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38802/38802 [01:33<00:00, 416.91it/s]


#### Matrix

In [46]:
%%time
def matrix_flag(data, use_dbscan=False):
    dates = data[var_date].values
    diff_days = data[var_date].diff(1).dt.days.dropna().values
    amounts = data[var_amnt].values
    orders = data.group_position.values

    subseries = matrixmethod.main_matrix_method(diff_days, amounts, use_dbscan=use_dbscan)
    
    flags = np.ones(len(data)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return pd.Series({'cluster_id_matrix_binning': flags, 'group_position': orders})


bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_flag, use_dbscan=False)\
    .explode(['cluster_id_matrix_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19958/19958 [00:50<00:00, 395.02it/s]


CPU times: user 49.7 s, sys: 605 ms, total: 50.3 s
Wall time: 50.8 s


#### Graphs

In [47]:
%%time

def flag_matrix(dates, amounts):    
    dates = np.array(dates).astype(np.datetime64)
    amounts = np.array(amounts)

    datediffs_to_previous = np.diff(dates).astype('timedelta64[D]')
    datediffs_to_previous = datediffs_to_previous/ np.timedelta64(1, 'D')

    subseries = graphmethod.main_matrix_method_graphs(datediffs_to_previous, amounts, use_dbscan=False)

    flags = np.ones(len(dates)) * -1
    
    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return flags
    
def matrix_udf_graphs(data):
    dates = data.date.values
    amounts = data.amount.values
    orders = data.group_position.values
    
    subseries_ids = flag_matrix(dates, amounts)
    
    return pd.Series({'cluster_id_matrix_graph_binning': subseries_ids, 'group_position': orders})
    

bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_udf_graphs)\
    .explode(['cluster_id_matrix_graph_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19958/19958 [03:31<00:00, 94.37it/s]


CPU times: user 3min 27s, sys: 1.23 s, total: 3min 28s
Wall time: 3min 31s


#### Yousfi (baseline)

In [48]:
%%capture

# YOUSFI  ######################################################################
class Config:
    def __init__(self):
        self.type_col='type'
        self.client_col= 'payment_channel'
        self.customer_id= 'payment_channel'
        self.time_col=  'date'
        self.amount_col='amount'
        self.trans_amount='amount'
        self.trans_date=  'date'
        self.trans_type=  'type'


config = Config()

def get_fn(df):
    dfs =  list((DetectRecurrencyII(
                  trans_data = df,
                  client_col= 'payment_channel',
                  time_col=  'date',
                  amount_col='amount',
                  config=config
                  )
           )[1].values())

    #import pdb; pdb.set_trace()
    # Add a cluster_id col for all dfs
    try:
        dfs = [dfs[i].assign(cluster_id = i).reset_index()  for i, d in enumerate(dfs)]
    except:
        import pdb; pdb.set_trace()

    if len(dfs) > 0:
        concat_df = pd.concat(dfs)


        out = pd.merge(df, 
              concat_df, 
             left_on=['date', 'amount'], 
             right_on=['date', 'amount'], how='left')

        dfs = out
    else:
        dfs = df.assign(cluster_id = -1)

    return dfs.loc[:, ['date', 'amount', 'cluster_id']].reset_index(drop=True)    



dft = df.copy()

outt = dft.sort_values(['payment_channel', 'date']).groupby('payment_channel').apply(lambda x: get_fn(x))
dft = pd.merge(dft, outt.reset_index().drop('level_1', axis=1), on=('payment_channel', 'date', 'amount'))
dft['cluster_id'] = dft['cluster_id'].fillna(-1).astype(int)

df['cluster_id_yousfi'] = dft.cluster_id

#### Metrics

In [49]:
# Compute the total number of series for each algorithm, for each PK (i.e. payment channel)
df_agg = pd.DataFrame(df.groupby([var_pk], as_index=False)[var_date].nunique( ))
df_agg.columns=[var_pk,'counts']
matrix_graph_binning = pd.DataFrame(df[df['cluster_id_matrix_graph_binning']>-1].groupby([var_pk],as_index=False)['cluster_id_matrix_graph_binning'].nunique())
matrix_graph_binning.columns=[var_pk,'matrix_graph_binning']
matrix_binning = pd.DataFrame(df[df['cluster_id_matrix_binning']>-1].groupby([var_pk],as_index=False)['cluster_id_matrix_binning'].nunique())
matrix_binning.columns=[var_pk,'matrix_binning']
dbscan = pd.DataFrame(df[df['cluster_id_dbscan']>-1].groupby([var_pk],as_index=False)['cluster_id_dbscan'].nunique())
dbscan.columns=[var_pk,'dbscan']
yousfi = pd.DataFrame(df[df['cluster_id_yousfi']>-1].groupby([var_pk],as_index=False)['cluster_id_yousfi'].nunique())
yousfi.columns=[var_pk,'yousfi']

print(df_agg.shape,matrix_graph_binning.shape,matrix_binning.shape,dbscan.shape)

(19958, 2) (19958, 2) (19956, 2) (19958, 2)


In [50]:
# Compute the exact match of 2 identified clusters (the closer to 1, the better)
print("Percent Exact splitting to two series")
df_all = df_agg.merge(matrix_graph_binning, on=[var_pk], how='left')
df_all = df_all.merge(matrix_binning, on=[var_pk], how='left')
df_all = df_all.merge(dbscan, on=[var_pk], how='left')
df_all = df_all.merge(yousfi, on=[var_pk], how='left')

df_all['match_matrix_graph_binning'] = df_all['matrix_graph_binning'].apply(lambda x: 1 if x==2 else 0)
df_all['match_matrix_binning'] = df_all['matrix_binning'].apply(lambda x: 1 if x==2 else 0)
df_all['match_dbscan'] = df_all['dbscan'].apply(lambda x: 1 if x==2 else 0)
df_all['match_yousfi'] = df_all['yousfi'].apply(lambda x: 1 if x==2 else 0)
df_all[['match_matrix_graph_binning',
       'match_matrix_binning', 'match_dbscan', 'match_yousfi']].describe().loc['mean']

Percent Exact splitting to two series


match_matrix_graph_binning    0.907556
match_matrix_binning          0.705031
match_dbscan                  0.942128
match_yousfi                  0.042088
Name: mean, dtype: float64

In [51]:
# For each of the original subseries, we set the "predicted cluster" as the maximum number of predictions on that subserie. 
# I.e. If a subseries gets the most of its points predicted as cluster_id = 2, then the subseries will be considered cluster = 2 and 
# the overlap amount will be measured with that cluster in mind (number of preds = 2 / total number of points in that suberie). 
# The same will be done with the other subserie.
# It can be the case in which both subseries are predicted within the same cluster_id. In that case, the overlap metric will be: 
#      number_of_ones in the shorter subserie / number of points on the entire (merged) serie.
# the final metric is the weighted average between the above two cases.

cluster_modes = df.groupby('payment_channel_ori').agg({'cluster_id_dbscan': lambda x: x.value_counts().index[0], 'cluster_id_matrix_binning': lambda x: x.value_counts().index[0], 'cluster_id_matrix_graph_binning': lambda x: x.value_counts().index[0], 'cluster_id_yousfi': lambda x: x.value_counts().index[0] })
cluster_modes.columns = ['mode_cluster_id_dbscan', 'mode_cluster_id_matrix_binning', 'mode_cluster_id_matrix_graph_binning', 'mode_cluster_id_yousfi']

dff = df.copy()

dff = dff.join(cluster_modes, on='payment_channel_ori')

dff = dff.join(
    dff.groupby('payment_channel_ori')['payment_channel_ori'].count().rename('series_len'), on='payment_channel_ori'
)

for m in ('dbscan', 'matrix_binning', 'matrix_graph_binning', 'yousfi'):
    uniques = dff.groupby('payment_channel')[f'cluster_id_{m}'].nunique()
    uniques = uniques[uniques < 2]

    metric_oks = (dff.loc[~dff.payment_channel.isin(set(uniques.index.tolist())), f'mode_cluster_id_{m}']  == dff.loc[~dff.payment_channel.isin(set(uniques.index.tolist())), f'cluster_id_{m}']).mean()
    metric_no_oks = (dff.loc[dff.payment_channel.isin(set(uniques.index.tolist()))].groupby('payment_channel')['series_len'].min() / dff.loc[dff.payment_channel.isin(set(uniques.index.tolist()))].groupby('payment_channel')['payment_channel'].count()).mean()
    if np.isnan(metric_no_oks):
        metric_no_oks = 0
    
    #weighted average
    proportion_no_oks = len(uniques) / dff.payment_channel.nunique()
    avg = metric_oks * (1 - proportion_no_oks) + metric_no_oks * proportion_no_oks
    print(f"Avg Matching points to original subseries for method {m} = {avg}")

Avg Matching points to original subseries for method dbscan = 0.9744693202917146
Avg Matching points to original subseries for method matrix_binning = 0.872627052732988
Avg Matching points to original subseries for method matrix_graph_binning = 0.9545087164992907
Avg Matching points to original subseries for method yousfi = 0.584198142122758


In [52]:
print('Avg. Splitting of the series (2 is the optimum value)')

unique_ignoring = lambda x: len(np.unique(x[x>=0]))
unique_ignoring.__name__ = 'unique_ignoring'

df.groupby(var_pk).agg({'cluster_id_matrix_graph_binning': unique_ignoring, 
                                  'cluster_id_matrix_binning': unique_ignoring,
                                  'cluster_id_dbscan': unique_ignoring,
                                  'cluster_id_yousfi': unique_ignoring
                       }).mean()

Avg. Splitting of the series (2 is the optimum value)


cluster_id_matrix_graph_binning    2.045546
cluster_id_matrix_binning          2.131727
cluster_id_dbscan                  1.968584
cluster_id_yousfi                  0.438922
dtype: float64

#### Visualizations

In [53]:
#get list of PK
#pks = df.groupby('payment_channel')['is_rec'].sum().sort_values(ascending=False)
#pks = pks.index.values.tolist() 

In [54]:
#df['day'] = df['date'].apply(lambda r:r.day)

In [55]:
##plot some samples
#for i in pks[0:15]:
#    t = df.loc[(df.payment_channel == i)].copy()
#    t[t.amount > 0]
#    #t.amount_lvl =  t.amount_lvl.astype(str)
#
#    print('...............................')
#    print(f' primary key: {i}')
#    days = t['day'].values.tolist() 
#    diffdays = t['date_diffdays'].values.tolist()
#
#    print(f'day of the month array: {days}')
#    print(f'diff days: {diffdays}')
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='payment_channel_ori',
#                    hue='cluster_id_yousfi'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='payment_channel_ori',
#                    hue='cluster_id_matrix_graph_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='payment_channel_ori',
#                    hue='cluster_id_matrix_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='payment_channel_ori',
#                    hue='cluster_id_dbscan'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()

## Experiment 3.1: Perturb points in the original stable series

Only move the already existing points

In [56]:
df = stable_channels.copy()#.drop('group_position', axis=1)
df = df.drop(['group_position','datediff', 'day'], axis=1)
df.shape

(436615, 4)

In [57]:
%%time
%%capture
noised_df = noiser.add_noise_update_exp3(
                            dataframe=df.copy(),
                            col_pk=var_pk,
                            col_amnt=var_amnt,
                            col_date=var_date,
                            run_noise_1=False)  # Do not add new noise

noised_df = noised_df.copy()#.drop(['datediff', 'group_position'], axis=1)

print("Computing diffdays...")
# compute diffdays
lelele = noised_df.sort_values(['payment_channel', 'date'])\
    .groupby('payment_channel')['date']\
    .diff(1).dt.days\
    .fillna(0)\
    .abs()

#noised_df = noised_df.join(lelele.rename('datediff'))
noised_df = pd.merge(noised_df, lelele.rename('datediff'), left_index=True, right_index=True, how='left')
dup_cols = ['trans_id', 'date', 'amount'] if 'trans_id' in noised_df.columns else ['date', 'amount']
noised_df.drop_duplicates(subset=dup_cols, keep='first', inplace=True, ignore_index=True)


noised_df = noised_df.sort_values(['payment_channel', 'date']).reset_index(drop=True)
posid = noised_df.groupby(['payment_channel']).cumcount().rename('group_position')
noised_df = noised_df.join(posid)

CPU times: user 17.9 s, sys: 412 ms, total: 18.3 s
Wall time: 18.8 s


In [58]:
noised_df.is_rec.value_counts() / len(noised_df)

1    0.830246
0    0.169754
Name: is_rec, dtype: float64

#### DBSCAN

In [59]:
df = noised_df.copy()

# Save original label
df['is_rec_orii'] =  df.is_rec

df.is_rec.value_counts() / len(df)

1    0.830246
0    0.169754
Name: is_rec, dtype: float64

In [60]:
%%time
df = dbscanmethod.main_dbscan_method(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32008/32008 [00:32<00:00, 975.00it/s]

CPU times: user 36.7 s, sys: 1.38 s, total: 38 s
Wall time: 37.8 s





In [61]:
df['cluster_id_dbscan'] = df['is_rec']
df['is_rec'] = df['is_rec_orii']
#df = df.drop('is_rec_ori', axis=1)

#### Matrix 

In [62]:
%%time
def matrix_flag(data, use_dbscan=False):
    dates = data[var_date].values
    diff_days = data[var_date].diff(1).dt.days.dropna().values
    amounts = data[var_amnt].values
    orders = data.group_position.values

    subseries = matrixmethod.main_matrix_method(diff_days, amounts, use_dbscan=use_dbscan)
    
    flags = np.ones(len(data)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return pd.Series({'cluster_id_matrix_binning': flags, 'group_position': orders})


bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_flag, use_dbscan=False)\
    .explode(['cluster_id_matrix_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])

df.loc[df.cluster_id_matrix_binning >= 0, 'cluster_id_matrix_binning'] = 1
df.loc[df.cluster_id_matrix_binning < 0, 'cluster_id_matrix_binning'] = 0

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27521/27521 [00:18<00:00, 1518.57it/s]

CPU times: user 17.8 s, sys: 233 ms, total: 18 s
Wall time: 18.3 s





#### Graphs

In [63]:
%%time
def flag_matrix(dates, amounts):    
    dates = np.array(dates).astype(np.datetime64)
    amounts = np.array(amounts)
    
    datediffs_to_previous = np.diff(dates).astype('timedelta64[D]')
    datediffs_to_previous = datediffs_to_previous/ np.timedelta64(1, 'D')
    subseries = graphmethod.main_matrix_method_graphs(datediffs_to_previous, amounts, use_dbscan=False)

    flags = np.ones(len(dates)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return flags
    
def matrix_udf_graphs(data):
    dates = data.date.values
    amounts = data.amount.values
    orders = data.group_position.values
    
    subseries_ids = flag_matrix(dates, amounts)
    
    return pd.Series({'cluster_id_matrix_graph_binning': subseries_ids, 'group_position': orders})

bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_udf_graphs)\
    .explode(['cluster_id_matrix_graph_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])
df.loc[df.cluster_id_matrix_graph_binning >= 0, 'cluster_id_matrix_graph_binning'] = 1
df.loc[df.cluster_id_matrix_graph_binning < 0, 'cluster_id_matrix_graph_binning'] = 0

print(classification_report(df.is_rec.values, df.cluster_id_matrix_graph_binning.values.astype(int)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27521/27521 [00:15<00:00, 1756.42it/s]

              precision    recall  f1-score   support

           0       0.51      0.97      0.66     21876
           1       0.99      0.81      0.89    106993

    accuracy                           0.83    128869
   macro avg       0.75      0.89      0.78    128869
weighted avg       0.91      0.83      0.85    128869

CPU times: user 15.6 s, sys: 240 ms, total: 15.8 s
Wall time: 15.9 s





#### Yousfi (baseline)

In [64]:
%%capture
class Config:
    def __init__(self):
        self.type_col='type'
        self.client_col= 'payment_channel'
        self.customer_id= 'payment_channel'
        self.time_col=  'date'
        self.amount_col='amount'
        self.trans_amount='amount'
        self.trans_date=  'date'
        self.trans_type=  'type'


config = Config()

def get_fn(df):
    dfs =  list((DetectRecurrencyII(
                  trans_data = df,
                  client_col= 'payment_channel',
                  time_col=  'date',
                  amount_col='amount',
                  config=config
                  )
           )[1].values())

    #import pdb; pdb.set_trace()
    # Add a cluster_id col for all dfs
    try:
        dfs = [dfs[i].assign(cluster_id = i).reset_index()  for i, d in enumerate(dfs)]
    except:
        import pdb; pdb.set_trace()

    if len(dfs) > 0:
        concat_df = pd.concat(dfs)


        out = pd.merge(df, 
              concat_df, 
             left_on=['date', 'amount'], 
             right_on=['date', 'amount'], how='left')

        dfs = out
    else:
        dfs = df.assign(cluster_id = -1)

    return dfs.loc[:, ['date', 'amount', 'cluster_id']].reset_index(drop=True)    



dft = df.copy()

outt = dft.sort_values(['payment_channel', 'date']).groupby('payment_channel').apply(lambda x: get_fn(x))
dft = pd.merge(dft, outt.reset_index().drop('level_1', axis=1), on=('payment_channel', 'date', 'amount'))
dft['cluster_id'] = dft['cluster_id'].fillna(-1).astype(int)

df['cluster_id_yousfi'] = dft.cluster_id

df.loc[df['cluster_id_yousfi'] >= 0, 'cluster_id_yousfi'] = 1
df.loc[df['cluster_id_yousfi'] < 0, 'cluster_id_yousfi'] = 0

### Metrics

In [65]:
for method in ('cluster_id_matrix_graph_binning', 'cluster_id_matrix_binning', 'cluster_id_dbscan', 'cluster_id_yousfi'):
    print(f"REPORT FOR METHOD: {method} -----------------")
    print(classification_report(df.is_rec.values, df[method].values.astype(int)))

    print()

REPORT FOR METHOD: cluster_id_matrix_graph_binning -----------------
              precision    recall  f1-score   support

           0       0.51      0.97      0.66     21876
           1       0.99      0.81      0.89    106993

    accuracy                           0.83    128869
   macro avg       0.75      0.89      0.78    128869
weighted avg       0.91      0.83      0.85    128869


REPORT FOR METHOD: cluster_id_matrix_binning -----------------
              precision    recall  f1-score   support

           0       0.38      0.96      0.54     21876
           1       0.99      0.68      0.81    106993

    accuracy                           0.73    128869
   macro avg       0.68      0.82      0.67    128869
weighted avg       0.88      0.73      0.76    128869


REPORT FOR METHOD: cluster_id_dbscan -----------------
              precision    recall  f1-score   support

           0       0.63      1.00      0.77     21876
           1       1.00      0.88      0.94    1

### Visualizations
 

In [66]:
#get list of PK
#pks = df.groupby('payment_channel')['is_rec'].sum().sort_values(ascending=False)
#pks = pks.index.values.tolist() 

In [67]:
#df['day'] = df['date'].apply(lambda r:r.day)

In [68]:
#plot some samples
#for i in pks[0:15]:
#    t = df.loc[(df.payment_channel == i)].copy()
#    t[t.amount > 0]
#    #t.amount_lvl =  t.amount_lvl.astype(str)
#
#
#    print('...............................')
#    print(f' primary key: {i}')
#    days = t['day'].values.tolist() 
#    diffdays = t['date_diffdays'].values.tolist()
#
#    print(f'day of the month array: {days}')
#    print(f'diff days: {diffdays}')
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_yousfi'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"upper left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_graph_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"upper left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"upper left")
#    plt.show()
#
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_dbscan'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"upper left")
#    plt.show()

## Experiment 3.2: Perturb points in the original stable series

Move the already existing points AND add new noise

In [69]:
df = stable_channels.copy().drop('group_position', axis=1)

In [70]:
#print(stable_channels[stable_channels['payment_channel']=='7445-21391058-PREVOD NA UCET-VYDAJ'].shape)
#df1 = df[['payment_channel','date','amount']]
#df1[df1['payment_channel']=='7445-21391058-PREVOD NA UCET-VYDAJ'].head(40)

In [71]:
%%time
%%capture
noised_df = noiser.add_noise_update_exp3(
                            dataframe=df.copy(),
                            col_pk=var_pk,
                            col_amnt=var_amnt,
                            col_date=var_date,
                            run_noise_1=True)  # add new noise

noised_df = noised_df.copy()#.drop(['datediff', 'group_position'], axis=1)

print("Computing diffdays...")
# compute diffdays
lelele = noised_df.sort_values(['payment_channel', 'date'])\
    .groupby('payment_channel')['date']\
    .diff(1).dt.days\
    .fillna(0)\
    .abs()

#noised_df = noised_df.join(lelele.rename('datediff'))
noised_df = pd.merge(noised_df, lelele.rename('datediff'), left_index=True, right_index=True, how='left')
#dup_cols = ['trans_id', 'date', 'amount'] if 'trans_id' in noised_df.columns else ['date', 'amount']
#noised_df.drop_duplicates(subset=dup_cols, keep='first', inplace=True, ignore_index=True)

noised_df = noised_df.sort_values(['payment_channel', 'date']).reset_index(drop=True)
posid = noised_df.groupby(['payment_channel']).cumcount().rename('group_position')
noised_df = noised_df.join(posid)

CPU times: user 26.1 s, sys: 614 ms, total: 26.7 s
Wall time: 27 s


In [72]:
noised_df.is_rec.value_counts() / len(noised_df)

1    0.839926
0    0.160074
Name: is_rec, dtype: float64

#### DBSCAN

In [73]:
df = noised_df.copy()


# Save original label
df['is_rec_orii'] =  df.is_rec

df.is_rec.value_counts() / len(df)

1    0.839926
0    0.160074
Name: is_rec, dtype: float64

In [74]:
%%time
df = dbscanmethod.main_dbscan_method(df)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74140/74140 [02:01<00:00, 607.86it/s]


CPU times: user 2min 3s, sys: 4.76 s, total: 2min 8s
Wall time: 2min 9s


In [75]:
df['cluster_id_dbscan'] = df['is_rec']
df['is_rec'] = df['is_rec_orii']

#### Matrix 

In [76]:
%%time
def matrix_flag(data, use_dbscan=False):
    dates = data[var_date].values
    diff_days = data[var_date].diff(1).dt.days.dropna().values
    amounts = data[var_amnt].values
    orders = data.group_position.values

    subseries = matrixmethod.main_matrix_method(diff_days, amounts, use_dbscan=use_dbscan)
    
    flags = np.ones(len(data)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return pd.Series({'cluster_id_matrix_binning': flags, 'group_position': orders})


bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_flag, use_dbscan=False)\
    .explode(['cluster_id_matrix_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])

df.loc[df.cluster_id_matrix_binning >= 0, 'cluster_id_matrix_binning'] = 1
df.loc[df.cluster_id_matrix_binning < 0, 'cluster_id_matrix_binning'] = 0

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39916/39916 [00:53<00:00, 744.24it/s]


CPU times: user 52 s, sys: 507 ms, total: 52.5 s
Wall time: 54.1 s


#### Graphs

In [77]:
%%time
def flag_matrix(dates, amounts):    
    dates = np.array(dates).astype(np.datetime64)
    amounts = np.array(amounts)
    
    datediffs_to_previous = np.diff(dates).astype('timedelta64[D]')
    datediffs_to_previous = datediffs_to_previous/ np.timedelta64(1, 'D')
    subseries = graphmethod.main_matrix_method_graphs(datediffs_to_previous, amounts, use_dbscan=False)

    flags = np.ones(len(dates)) * -1

    # Return array with subseries ids (eg: [0,1,1,1,1,1,0,0,2,0,2,2,2,2])
    l = list(enumerate(subseries))
    for i, indices in l:
        flags[indices] = i

    return flags
    
def matrix_udf_graphs(data):
    dates = data.date.values
    amounts = data.amount.values
    orders = data.group_position.values
    
    subseries_ids = flag_matrix(dates, amounts)
    
    return pd.Series({'cluster_id_matrix_graph_binning': subseries_ids, 'group_position': orders})

bin_nbs = df.sort_values([var_pk, var_date])\
    .groupby([var_pk])\
    .progress_apply(matrix_udf_graphs)\
    .explode(['cluster_id_matrix_graph_binning', 'group_position'])

df = pd.merge(df, bin_nbs, on=[var_pk, 'group_position'])
df.loc[df.cluster_id_matrix_graph_binning >= 0, 'cluster_id_matrix_graph_binning'] = 1
df.loc[df.cluster_id_matrix_graph_binning < 0, 'cluster_id_matrix_graph_binning'] = 0



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39916/39916 [01:37<00:00, 411.49it/s]


CPU times: user 1min 35s, sys: 914 ms, total: 1min 36s
Wall time: 1min 37s


#### Yousfi (baseline)

In [78]:
%%capture
class Config:
    def __init__(self):
        self.type_col='type'
        self.client_col= 'payment_channel'
        self.customer_id= 'payment_channel'
        self.time_col=  'date'
        self.amount_col='amount'
        self.trans_amount='amount'
        self.trans_date=  'date'
        self.trans_type=  'type'


config = Config()

def get_fn(df):
    dfs =  list((DetectRecurrencyII(
                  trans_data = df,
                  client_col= 'payment_channel',
                  time_col=  'date',
                  amount_col='amount',
                  config=config
                  )
           )[1].values())

    #import pdb; pdb.set_trace()
    # Add a cluster_id col for all dfs
    try:
        dfs = [dfs[i].assign(cluster_id = i).reset_index()  for i, d in enumerate(dfs)]
    except:
        import pdb; pdb.set_trace()

    if len(dfs) > 0:
        concat_df = pd.concat(dfs)


        out = pd.merge(df, 
              concat_df, 
             left_on=['date', 'amount'], 
             right_on=['date', 'amount'], how='left')

        dfs = out
    else:
        dfs = df.assign(cluster_id = -1)

    return dfs.loc[:, ['date', 'amount', 'cluster_id']].reset_index(drop=True)    



dft = df.copy()

outt = dft.sort_values(['payment_channel', 'date']).groupby('payment_channel').apply(lambda x: get_fn(x))
dft = pd.merge(dft, outt.reset_index().drop('level_1', axis=1), on=('payment_channel', 'date', 'amount'))
dft['cluster_id'] = dft['cluster_id'].fillna(-1).astype(int)

df['cluster_id_yousfi'] = dft.cluster_id

df.loc[df['cluster_id_yousfi'] >= 0, 'cluster_id_yousfi'] = 1
df.loc[df['cluster_id_yousfi'] < 0, 'cluster_id_yousfi'] = 0

#### Metrics

In [79]:
for method in ('cluster_id_matrix_graph_binning', 'cluster_id_matrix_binning', 'cluster_id_dbscan', 'cluster_id_yousfi'):
    print(f"REPORT FOR METHOD: {method} -----------------")
    print(classification_report(df.is_rec.values, df[method].values.astype(int)))

    print()

REPORT FOR METHOD: cluster_id_matrix_graph_binning -----------------
              precision    recall  f1-score   support

           0       0.64      0.91      0.75     83372
           1       0.98      0.90      0.94    437463

    accuracy                           0.91    520835
   macro avg       0.81      0.91      0.85    520835
weighted avg       0.93      0.91      0.91    520835


REPORT FOR METHOD: cluster_id_matrix_binning -----------------
              precision    recall  f1-score   support

           0       0.47      0.93      0.62     83372
           1       0.98      0.80      0.88    437463

    accuracy                           0.82    520835
   macro avg       0.73      0.86      0.75    520835
weighted avg       0.90      0.82      0.84    520835


REPORT FOR METHOD: cluster_id_dbscan -----------------
              precision    recall  f1-score   support

           0       0.71      0.85      0.77     83372
           1       0.97      0.93      0.95    4

### Visualizations

In [80]:
#get list of PK
#pks = df.groupby('payment_channel')['is_rec'].sum().sort_values(ascending=False)
#pks = pks.index.values.tolist() 

In [81]:
##plot some samples
#for i in ['7445-21391058-PREVOD NA UCET-VYDAJ']+pks[0:15]:
#    t = df.loc[(df.payment_channel == i)].copy()
#    t[t.amount > 0]
#    #t.amount_lvl =  t.amount_lvl.astype(str)
#
#    print('...............................')
#    print(f' primary key: {i}')
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_yousfi'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_graph_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_matrix_binning'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()
#
#    
#    ax = sns.scatterplot(t, 
#                    x='date', 
#                    y='amount', 
#                    style='is_rec_orii',
#                    hue='cluster_id_dbscan'
#                   )
#    ax.set(xticklabels=[]) 
#    ax.set(xlabel=None)
#    ax.tick_params(bottom=False)
#    sns.move_legend( ax,"center left")
#    plt.show()