In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Path of the ';'-separated transactions file that is loaded into `df` below.
BASE_DATA_PATH = 'trans.asc'

In [3]:
import dbscanmethod
import graphsmethod as graphmethod
import matrixmethod
from yousi import DetectRecurrencyII

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the ';'-delimited transactions file. low_memory=False makes pandas parse
# the file in one pass instead of in chunks, avoiding mixed-dtype columns.
df = pd.read_csv(BASE_DATA_PATH, sep=';', low_memory=False)

In [5]:
# Keep only the rows whose `account` value is present.
df = df[df['account'].notna()]

In [6]:
# Cast `account` to int by rebinding the column on a new frame.
# Assigning through `df.loc[:, 'account'] = ...` writes into the existing column
# in place on pandas >= 2.0 and keeps the column's previous dtype, so the cast
# would be silently lost (the string key built from `account` further down would
# then presumably embed values such as '123.0' — verify on your pandas version).
# Rebinding also avoids the warning the original assignment emitted.
df = df.assign(account=df['account'].astype(int))

  df.loc[:, 'account'] = df.loc[:, 'account'].astype(int)


In [7]:
# Derive the channel key "<account_id>-<account>-<operation>-<type>" and parse
# the `date` column (format %y%m%d) into datetimes.
# Amounts are left untouched here (no abs(), no `is_rec` initialisation).
df = df.assign(
    payment_channel=df['account_id'].astype(str).str.cat(
        [
            df['account'].astype(str),
            df['operation'].astype(str),
            df['type'].astype(str),
        ],
        sep='-',
    ),
    date=pd.to_datetime(df['date'].astype(str), format='%y%m%d'),
)

In [8]:
# Number of most recent transactions kept per payment channel, and the minimum
# number of transactions a channel must have to be kept for the analysis.
LAST_N_PER_CHANNEL = 100
MIN_TRANSACTIONS_PER_CHANNEL = 5

print(f"Taking last {LAST_N_PER_CHANNEL} of each payment_channel...")
# groupby(...).tail(n) keeps the last n rows in the frame's *current* row order.
# Sort by date first (stable, so same-day rows keep their file order) so that
# "last" really means "most recent" even if the input file is not date-ordered.
df = (
    df.sort_values('date', kind='stable')
    .groupby('payment_channel')
    .tail(LAST_N_PER_CHANNEL)
)

# Remove payment channels with too few transactions
cnts = df.groupby('payment_channel')['payment_channel'].count().sort_values(ascending=True)
df = df.loc[df['payment_channel'].isin(cnts[cnts >= MIN_TRANSACTIONS_PER_CHANNEL].index)]

Taking last 100 of each payment_channel...


In [9]:
print("Computing diffdays...")
# Days elapsed since the previous transaction of the same payment channel
# (0 for the first transaction of each channel). The diff is taken on a
# (payment_channel, date)-sorted view; the join aligns it back to `df` by index.
days_since_previous = (
    df.sort_values(['payment_channel', 'date'])
    .groupby('payment_channel')['date']
    .diff(1)
    .dt.days
    .fillna(0)
    .abs()
    .rename('datediff')
)

df = df.join(days_since_previous)

Computing diffdays...


In [10]:
# Order every channel chronologically on a fresh 0..n-1 index, then number the
# transactions inside each channel (0 = first row of the channel after sorting).
df = df.sort_values(['payment_channel', 'date'], ignore_index=True)
df = df.join(
    df.groupby(['payment_channel']).cumcount().rename('group_position')
)

In [11]:
# Snapshot of the prepared transactions; every detection method below starts
# from this frame while `df` gets overwritten with each method's result.
df_ori = df.copy(deep=True)

### Filtering methods (see the header comment of each cell to know which method is being executed)

In [12]:
%%time
#DBSCAN  ######################################################################
# Run the DBSCAN-based detector on the prepared transactions.
df = dbscanmethod.main_dbscan_method(df_ori, eps_date=0.05, eps_amount=0.1)

# Cluster id is "<amount_lvl>-<day_cluster>", both rendered as integers.
df['cluster_id'] = (
    df['amount_lvl'].astype(int).astype(str)
    .str.cat(df['day_cluster'].astype(int).astype(str), sep='-')
)

# Note: the printed value is a fraction in [0, 1], not a percentage.
print(f"Percentage of non-recurrent points using DBSCAN: {len(df.loc[df['is_rec'] == 0]) / len(df)}")

df.to_parquet('filtered_dbscan.pqt')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10173/10173 [01:38<00:00, 103.30it/s]


Percentage of non-recurrent points using DBSCAN: 0.04511679447435688
CPU times: user 1min 38s, sys: 3.69 s, total: 1min 41s
Wall time: 1min 41s


In [13]:
%%time
# GRAPHS  ######################################################################
def flag_matrix(dates, amounts):
    """Label each transaction of one series with the id of its subseries.

    Parameters
    ----------
    dates : array-like
        Transaction dates, convertible to ``numpy.datetime64``.
    amounts : array-like
        Transaction amounts, one per date.

    Returns
    -------
    numpy.ndarray
        Float array of ``len(dates)``. Positions listed in the i-th subseries
        returned by ``graphmethod.main_matrix_method_graphs`` hold ``i``;
        positions that belong to no subseries hold ``-1``.
    """
    date_array = np.asarray(dates).astype(np.datetime64)
    amount_array = np.array(amounts)

    # Gap between consecutive transactions, in (float) days: n dates -> n-1 gaps.
    gaps_in_days = np.diff(date_array).astype('timedelta64[D]') / np.timedelta64(1, 'D')

    subseries = graphmethod.main_matrix_method_graphs(
        gaps_in_days, amount_array, use_dbscan=False
    )

    # Start with "no subseries" everywhere, then stamp each subseries id,
    # e.g. [-1, 0, 0, 0, -1, 1, 1].
    flags = np.full(len(date_array), -1.0)
    for subseries_id, member_positions in enumerate(subseries):
        flags[member_positions] = subseries_id

    return flags

    
def matrix_udf_graphs(data):
    """Group-wise UDF: label one payment channel's rows via ``flag_matrix``.

    ``data`` must provide the columns ``date``, ``amount`` and
    ``group_position``. Returns a two-entry Series holding the array of
    subseries ids (``cluster_id``) and the matching ``group_position`` array,
    so the caller can explode both and merge the labels back.
    """
    return pd.Series({
        'cluster_id': flag_matrix(data['date'].values, data['amount'].values),
        'group_position': data['group_position'].values,
    })


# Label every payment channel with the graph method: one (cluster_id,
# group_position) pair of arrays per channel, exploded to one row per transaction.
bin_nbs = (
    df_ori.sort_values(['payment_channel', 'date'])
    .groupby(['payment_channel'])
    .apply(matrix_udf_graphs)
    .explode(['cluster_id', 'group_position'])
)

# Attach the labels to the transactions; cluster_id >= 0 marks a recurrent point.
df = df_ori.merge(bin_nbs, on=['payment_channel', 'group_position'])
df = df.astype({'cluster_id': int})
df['is_rec'] = df['cluster_id'].ge(0).astype(int)


# Note: the printed value is a fraction in [0, 1], not a percentage.
print(f"Percentage of non-recurrent points using GRAPH: {len(df.loc[df['is_rec'] == 0]) / len(df)}")

df.to_parquet('filtered_graphs.pqt')

Percentage of non-recurrent points using GRAPH: 0.02225077500407897
CPU times: user 2min 25s, sys: 927 ms, total: 2min 26s
Wall time: 2min 28s


In [14]:
%%time

# MATRIX  ######################################################################
def matrix_flag(data):
    """Group-wise UDF: label one payment channel's rows with the matrix method.

    ``data`` must provide the columns ``datediff``, ``amount`` and
    ``group_position``. The first ``datediff`` entry is skipped, so the method
    receives n-1 day gaps together with n amounts.

    Returns a two-entry Series: ``cluster_id`` (float array, the index of the
    subseries each row belongs to, or ``-1`` for none) and ``group_position``
    (the rows' positions, used by the caller to merge the labels back).
    """
    day_gaps = data['datediff'].values[1:]

    subseries = matrixmethod.main_matrix_method(
        day_gaps, data['amount'].values, use_dbscan=False
    )

    # Start with "no subseries" everywhere, then stamp each subseries id,
    # e.g. [-1, 0, 0, 0, -1, 1, 1].
    flags = np.full(len(data), -1.0)
    for subseries_id, member_positions in enumerate(subseries):
        flags[member_positions] = subseries_id

    return pd.Series({
        'cluster_id': flags,
        'group_position': data['group_position'].values,
    })

# Matrix Method
# Label every payment channel with the matrix method: one (cluster_id,
# group_position) pair of arrays per channel, exploded to one row per transaction.
bin_nbs = (
    df_ori.sort_values(['payment_channel', 'date'])
    .groupby(['payment_channel'])
    .apply(matrix_flag)
    .explode(['cluster_id', 'group_position'])
)

# Attach the labels to the transactions; cluster_id >= 0 marks a recurrent point.
df = df_ori.merge(bin_nbs, on=['payment_channel', 'group_position'])
df = df.astype({'cluster_id': int})
df['is_rec'] = df['cluster_id'].ge(0).astype(int)

# Note: the printed value is a fraction in [0, 1], not a percentage.
print(f"Percentage of non-recurrent points using MATRIX: {len(df.loc[df['is_rec'] == 0]) / len(df)}")

df.to_parquet('filtered_matrix.pqt')

Percentage of non-recurrent points using MATRIX: 0.050728775765486483
CPU times: user 55.4 s, sys: 763 ms, total: 56.2 s
Wall time: 57.2 s


In [15]:
# YOUSFI  ######################################################################

class Config:
    """Column-name settings passed as ``config`` to ``DetectRecurrencyII``.

    The same column is exposed under two attribute names (for example
    ``client_col`` and ``customer_id``).
    NOTE(review): presumably the detector reads both naming schemes — confirm
    against ``yousi.DetectRecurrencyII`` before removing either.
    """

    def __init__(self):
        # Column holding the transaction type.
        self.type_col = 'type'
        # Column identifying the series to analyse (one per payment channel).
        self.client_col = self.customer_id = 'payment_channel'
        # Column holding the transaction date.
        self.time_col = 'date'
        # Column holding the transaction amount.
        self.amount_col = self.trans_amount = 'amount'
        # Aliases of the date and type columns under the `trans_*` names.
        self.trans_date = 'date'
        self.trans_type = 'type'


# Shared settings instance; get_fn passes it to DetectRecurrencyII as `config`.
config = Config()

def get_fn(df):
    """Label the transactions of one payment channel with ``DetectRecurrencyII``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions of a single payment channel; must contain the columns
        ``payment_channel``, ``date`` and ``amount``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with the columns ``date``, ``amount`` and
        ``cluster_id``. ``cluster_id`` is the position of the detected
        sub-series the transaction belongs to, NaN when the transaction is in
        none of them, and ``-1`` for every row when nothing was detected.

    Notes
    -----
    Errors raised while labelling now propagate to the caller. The previous
    bare ``except`` opened ``pdb``, which blocks forever when the cell runs
    non-interactively (for example under ``%%capture``).
    """
    detection = DetectRecurrencyII(
        trans_data=df,
        client_col='payment_channel',
        time_col='date',
        amount_col='amount',
        config=config,
    )
    # Element 1 of the result is a mapping whose values are the detected
    # sub-series (DataFrames); their order defines the cluster ids.
    subseries = list(detection[1].values())

    if subseries:
        labels = pd.concat(
            [sub.assign(cluster_id=i).reset_index() for i, sub in enumerate(subseries)]
        )
        # Keep only the merge keys and the label, one row per (date, amount).
        # Duplicate keys on the right would turn the left merge into a
        # many-to-many join and multiply rows of `df`. If a pair shows up in
        # several sub-series, the lowest cluster id wins.
        labels = (
            labels.loc[:, ['date', 'amount', 'cluster_id']]
            .drop_duplicates(['date', 'amount'])
        )
        labelled = pd.merge(df, labels, on=['date', 'amount'], how='left')
    else:
        labelled = df.assign(cluster_id=-1)

    return labelled.loc[:, ['date', 'amount', 'cluster_id']].reset_index(drop=True)

In [16]:
# Independent working copy of the prepared transactions for the Yousfi method.
dft = df_ori.copy(deep=True)

In [17]:
%%capture
# Run get_fn once per payment channel on chronologically ordered rows.
outt = (
    dft.sort_values(['payment_channel', 'date'])
    .groupby('payment_channel')
    .apply(get_fn)
)

In [18]:
# Attach the per-channel labels to the transactions.
# `outt` can hold several rows for the same (payment_channel, date, amount) key
# (e.g. identical same-day payments); left as is, the merge would multiply the
# matching rows of `dft`. Keep one label row per key and let pandas enforce the
# many-to-one relationship.
dft = pd.merge(
    dft,
    outt.reset_index()
    .drop('level_1', axis=1)
    .drop_duplicates(['payment_channel', 'date', 'amount']),
    on=['payment_channel', 'date', 'amount'],
    validate='many_to_one',
)

In [19]:
# Transactions without a detected sub-series get cluster id -1; ids become ints.
dft = dft.fillna({'cluster_id': -1}).astype({'cluster_id': int})

In [20]:
# Persist the Yousfi labels. No `is_rec` column is written here; the stats cell
# derives it from `cluster_id` when the column is missing.
dft.to_parquet('filtered_yousfi.pqt')

Plot a single example

In [None]:
# Scatter the transactions of one payment channel, marked and coloured by the
# cluster each transaction was assigned to.
# NOTE(review): `df` is whatever frame the most recently executed cell left
# behind — confirm which method's labels are meant to be plotted here.
t = df.loc[df.payment_channel == '3514-67514222-PREVOD Z UCTU-PRIJEM'].copy()
# NOTE(review): the original cell evaluated `t[t.amount > 0]` and discarded the
# result, so no amount filter was ever applied. Assign it back to `t` if only
# positive amounts were meant to be plotted.

sns.scatterplot(
    data=t,  # keyword form; passing the frame positionally needs a recent seaborn
    x='date',
    y='amount',
    style='cluster_id',
    hue='cluster_id',
)

**Recognition stats**

In [28]:
# For every method: share of payment channels with at least one recurrent
# sub-series, and the number of distinct (payment_channel, cluster_id) pairs
# among the recurrent points.
for method in ('dbscan', 'matrix', 'graphs', 'yousfi'):
    df = pd.read_parquet(f'filtered_{method}.pqt')

    # Result files without an `is_rec` flag: a point is recurrent when it has
    # a non-negative cluster id.
    if 'is_rec' not in df.columns:
        df['is_rec'] = (df['cluster_id'] >= 0).astype(int)

    # Filter the recurrent points once and reuse them for both statistics.
    recurrent = df[df.is_rec > 0]
    channel_share = recurrent.payment_channel.nunique() / df.payment_channel.nunique()
    total_rec_series = recurrent.groupby(['payment_channel', 'cluster_id'])['payment_channel'].nunique().sum()

    # Note: the first value is a fraction in [0, 1], not a percentage.
    print(f"[{method.upper()}] Percentage of payment channels with some recurrent sub-pattern: {channel_share} \t Total recurrent subseries: {total_rec_series}")

[DBSCAN] Percentage of payment channels with some recurrent sub-pattern: 0.9162387310281866 	 Total recurrent subseries: 9071
[MATRIX] Percentage of payment channels with some recurrent sub-pattern: 0.9535547187036403 	 Total recurrent subseries: 10961
[GRAPHS] Percentage of payment channels with some recurrent sub-pattern: 0.9879036859522995 	 Total recurrent subseries: 11837
[YOUSFI] Percentage of payment channels with some recurrent sub-pattern: 0.870478146753395 	 Total recurrent subseries: 7893
