In [5]:
import pandas as pd
import datetime
from datetime import timedelta
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Default figure size for every plot in this notebook (matplotlib figsize is
# in inches; 11.7 x 8.27 is roughly an A4 page in landscape).
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [6]:
# Load the raw transaction log (path is relative to the notebook's working dir).
# NOTE: parse_dates=True only asks pandas to parse the *index* as dates; no
# index column is set here, so 'reqDate' is loaded as plain strings (dtype
# object) and has to be converted explicitly afterwards.
df = pd.read_csv("module-2-linux-database-deployments/source/vtf2020_module2_data.csv",  parse_dates=True)

In [7]:
# Non-null count per column; a column with a lower count than the others
# contains missing values (in the recorded output only 'bankCode' does).
df.count()

userID         2103296
reqDate        2103296
transID        2103296
bankCode       1051478
amount         2103296
transStatus    2103296
type           2103296
dtype: int64

In [8]:
# Inspect the raw timestamp column; at this point it still holds strings.
df['reqDate']

0          2018-12-28 00:07:36.302
1          2018-12-28 00:08:35.218
2          2018-12-28 00:27:33.530
3          2018-12-28 01:00:55.220
4          2018-12-28 01:48:51.443
                    ...           
2103291    2018-12-14 13:17:52.888
2103292    2018-12-14 15:41:49.949
2103293    2018-12-30 08:57:41.758
2103294    2018-12-07 12:09:06.035
2103295    2018-12-06 19:37:38.320
Name: reqDate, Length: 2103296, dtype: object

In [9]:
# Truncate each request timestamp to its calendar day (datetime.date objects),
# since every analysis below works at day granularity.
# pd.to_datetime parses the whole column in one vectorized call instead of
# invoking datetime.strptime once per row through .apply on ~2.1M rows.
# NOTE: this overwrites 'reqDate' in place, so the time-of-day part is lost.
df['reqDate'] = pd.to_datetime(df['reqDate'], format='%Y-%m-%d %H:%M:%S.%f').dt.date

# Bank Success Rate

In [10]:
# Show every transaction whose bank code is exactly 'BIDV'.
df[df['bankCode'].eq('BIDV')]

Unnamed: 0,userID,reqDate,transID,bankCode,amount,transStatus,type
899399,b65832ffed27be9673e83962da3da6c8,2018-12-20,181220000142753,BIDV,790000,0,138
1466314,15e59ee761c854df56df4ac4c896ff40,2018-12-24,181224000019983,BIDV,117000,0,24
1520545,30884aff88caa1497b5e248dbc99226e,2018-12-26,181226000126720,BIDV,239000,0,237


In [11]:
# Distinct request dates as a plain Python list in chronological order.
sorted_date = sorted(pd.unique(df['reqDate']))

In [12]:
# Daily success rate per bank.
# - successTransactions: sum of 'transStatus' within the group
#   (NOTE(review): this equals the number of successes only if transStatus
#   is a 0/1 flag; confirm against the data dictionary).
# - totalTransactions: number of transaction IDs in the group.
# Rows with a missing bankCode are left out by groupby's default NaN handling.
bank_success = (
    df.groupby(['bankCode', 'reqDate'])
    .agg(
        successTransactions=('transStatus', 'sum'),
        totalTransactions=('transID', 'count'),
    )
    .assign(
        SuccessRate=lambda daily: daily['successTransactions'] / daily['totalTransactions']
    )
)
bank_success

Unnamed: 0_level_0,Unnamed: 1_level_0,successTransactions,totalTransactions,SuccessRate
bankCode,reqDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BIDV,2018-12-20,0,1,0.000000
BIDV,2018-12-24,0,1,0.000000
BIDV,2018-12-26,0,1,0.000000
CC,2018-12-26,0,1,0.000000
CC,2018-12-30,0,1,0.000000
...,...,...,...,...
ZPVCCB,2018-12-27,52,91,0.571429
ZPVCCB,2018-12-28,63,75,0.840000
ZPVCCB,2018-12-29,28,29,0.965517
ZPVCCB,2018-12-30,24,28,0.857143


In [26]:
# Flatten the (bankCode, reqDate) MultiIndex back into ordinary columns so
# each row carries its bank and date explicitly.
bank_success_reset_index = bank_success.reset_index()
bank_success_reset_index

Unnamed: 0,bankCode,reqDate,successTransactions,totalTransactions,SuccessRate
0,BIDV,2018-12-20,0,1,0.000000
1,BIDV,2018-12-24,0,1,0.000000
2,BIDV,2018-12-26,0,1,0.000000
3,CC,2018-12-26,0,1,0.000000
4,CC,2018-12-30,0,1,0.000000
...,...,...,...,...,...
286,ZPVCCB,2018-12-27,52,91,0.571429
287,ZPVCCB,2018-12-28,63,75,0.840000
288,ZPVCCB,2018-12-29,28,29,0.965517
289,ZPVCCB,2018-12-30,24,28,0.857143


In [28]:
# Print every row of the flattened table as a raw array, one row per line.
# NOTE: this emits one line per (bank, day) pair, i.e. the full table.
for record in bank_success_reset_index.to_numpy():
    print(record)

['BIDV' datetime.date(2018, 12, 20) 0 1 0.0]
['BIDV' datetime.date(2018, 12, 24) 0 1 0.0]
['BIDV' datetime.date(2018, 12, 26) 0 1 0.0]
['CC' datetime.date(2018, 12, 26) 0 1 0.0]
['CC' datetime.date(2018, 12, 30) 0 1 0.0]
['VTB' datetime.date(2018, 12, 22) 0 1 0.0]
['VTB' datetime.date(2018, 12, 25) 0 1 0.0]
['ZPBIDV' datetime.date(2018, 12, 1) 2841 3339 0.8508535489667565]
['ZPBIDV' datetime.date(2018, 12, 2) 2349 2671 0.8794459004118308]
['ZPBIDV' datetime.date(2018, 12, 3) 1935 2713 0.7132325838555105]
['ZPBIDV' datetime.date(2018, 12, 4) 1926 4050 0.47555555555555556]
['ZPBIDV' datetime.date(2018, 12, 5) 3117 5380 0.579368029739777]
['ZPBIDV' datetime.date(2018, 12, 6) 2606 4080 0.6387254901960784]
['ZPBIDV' datetime.date(2018, 12, 7) 3348 4439 0.7542239243072764]
['ZPBIDV' datetime.date(2018, 12, 8) 2887 3679 0.784724109812449]
['ZPBIDV' datetime.date(2018, 12, 9) 2748 3394 0.8096641131408367]
['ZPBIDV' datetime.date(2018, 12, 10) 4405 5337 0.8253700580850665]
['ZPBIDV' datetime.da

# RFM

In [13]:
# Latest transaction date in the dataset; used as the reference point for
# the recency calculation.
max_date = df['reqDate'].max()
max_date

datetime.date(2018, 12, 31)

In [14]:
# Per-user RFM raw metrics:
#   reqDate -> days between the user's last transaction and the day after
#              max_date (so a user active on max_date gets 1, not 0)
#   transID -> number of transactions
#   amount  -> total amount transacted
rfm = df.groupby(['userID']).agg({
    'reqDate': lambda dates: ((max_date + timedelta(days=1)) - dates.max()).days,
    'transID': 'count',
    'amount': 'sum',
})

In [15]:
# Rename positionally: the order must match the aggregation that built rfm
# (reqDate -> recency, transID -> frequency, amount -> monetary).
rfm.columns = ['recency', 'frequency', 'monetary']

In [16]:
# Quartile scores (1 = worst, 4 = best) for each RFM metric.
# Recency labels run 4 -> 1 because a smaller recency (more recent activity)
# is better; frequency and monetary labels run 1 -> 4.
rfm = rfm.assign(**{
    'r-score': pd.qcut(rfm['recency'], q=4, labels=range(4, 0, -1)),
    'f-score': pd.qcut(rfm['frequency'], q=4, labels=range(1, 5)),
    'm-score': pd.qcut(rfm['monetary'], q=4, labels=range(1, 5)),
})
rfm

Unnamed: 0_level_0,recency,frequency,monetary,r-score,f-score,m-score
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000084454cb2ae6275bb479b957d6b7b,18,7,591530,1,3,3
0000c78ffe9aaa92a4694f65e5465b24,11,2,350000,2,1,2
0000cce93f5aa9df5c0f5c9c002e2e75,20,2,1392600,1,1,4
0001081adfa0c6443ef076f85eeeee5c,11,3,383100,2,2,2
000169c037c452a0d0386f9d06933c5a,2,85,28827200,4,4,4
...,...,...,...,...,...,...
ffffa20c3d0c58f41dec3c5e99f848b2,9,2,211000,2,1,2
ffffa3034495807b1f393ef0a090d6e5,7,1,536000,3,1,3
ffffb1a8994d83f712c2819ab5e97173,11,2,12000,2,1,1
ffffe96889018bde56c719ae1b88711d,25,1,485000,1,1,3


In [17]:
# Concatenate the three quartile scores into a single code such as '444';
# the result is a string label, not a number.
rfm['RFM-score'] = rfm['r-score'].astype(str).str.cat(
    [rfm['f-score'].astype(str), rfm['m-score'].astype(str)]
)

In [18]:
# Display the scored table, now including the combined 'RFM-score' column.
rfm

Unnamed: 0_level_0,recency,frequency,monetary,r-score,f-score,m-score,RFM-score
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000084454cb2ae6275bb479b957d6b7b,18,7,591530,1,3,3,133
0000c78ffe9aaa92a4694f65e5465b24,11,2,350000,2,1,2,212
0000cce93f5aa9df5c0f5c9c002e2e75,20,2,1392600,1,1,4,114
0001081adfa0c6443ef076f85eeeee5c,11,3,383100,2,2,2,222
000169c037c452a0d0386f9d06933c5a,2,85,28827200,4,4,4,444
...,...,...,...,...,...,...,...
ffffa20c3d0c58f41dec3c5e99f848b2,9,2,211000,2,1,2,212
ffffa3034495807b1f393ef0a090d6e5,7,1,536000,3,1,3,313
ffffb1a8994d83f712c2819ab5e97173,11,2,12000,2,1,1,211
ffffe96889018bde56c719ae1b88711d,25,1,485000,1,1,3,113


# AppID

In [19]:
# Total amount per (user, type) pair, excluding types 1-4.
# NOTE(review): the meaning of types 1-4 is not documented here; confirm why
# they are excluded before reusing this table.
appid = (
    df[~df['type'].isin([1, 2, 3, 4])]
    .groupby(['userID', 'type'])
    .agg({'amount': 'sum'})
    .reset_index()
)
appid.head()

Unnamed: 0,userID,type,amount
0,000084454cb2ae6275bb479b957d6b7b,61,391530
1,0000c78ffe9aaa92a4694f65e5465b24,328,250000
2,0000cce93f5aa9df5c0f5c9c002e2e75,118,692600
3,0001081adfa0c6443ef076f85eeeee5c,12,97000
4,0001081adfa0c6443ef076f85eeeee5c,61,286100


# Users Transfer