Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.

SPDX-License-Identifier: Apache-2.0


# Notebook for raw data exploration 
***Please download the data*** for the financial fraud use case first, following the instructions in the Readme (or *notebooks/download_data.ipynb*). All notebooks related to the financial fraud use case need this data in order to run.

In [1]:
import pandas as pd
import numpy as np

# Load in raw data

In [2]:
# Location of the raw transaction CSV, given as a relative path
# (resolved against the notebook's working directory).
raw_data_path = '../../data/01_raw/financial_fraud/bs140513_032310.csv'

In [3]:
# Read the raw transaction CSV into a DataFrame using pandas' default parsing options.
raw_trans_data = pd.read_csv(raw_data_path)

In [4]:
# (row count, column count) of the raw transaction table.
raw_trans_data.shape

(594643, 10)

In [5]:
# Column labels available in the raw transaction table.
raw_trans_data.columns

Index(['step', 'customer', 'age', 'gender', 'zipcodeOri', 'merchant',
       'zipMerchant', 'category', 'amount', 'fraud'],
      dtype='object')

### The raw transaction data has more categorical variables: age, gender, and zipcode for the customer, and zipcode for the merchant

In [6]:
# Preview the first 10 rows of the raw transaction table.
raw_trans_data.head(10)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
5,0,'C1315400589','3','F','28007','M348934600','28007','es_transportation',25.81,0
6,0,'C765155274','1','F','28007','M348934600','28007','es_transportation',9.1,0
7,0,'C202531238','4','F','28007','M348934600','28007','es_transportation',21.17,0
8,0,'C105845174','3','M','28007','M348934600','28007','es_transportation',32.4,0
9,0,'C39858251','5','F','28007','M348934600','28007','es_transportation',35.4,0


In [7]:
# Summary statistics; the rendered output covers the step, amount and fraud columns.
raw_trans_data.describe()

Unnamed: 0,step,amount,fraud
count,594643.0,594643.0,594643.0
mean,94.986827,37.890135,0.012108
std,51.053632,111.402831,0.109369
min,0.0,0.0,0.0
25%,52.0,13.74,0.0
50%,97.0,26.9,0.0
75%,139.0,42.54,0.0
max,179.0,8329.96,1.0


In [8]:
# Order the transactions by customer, then by step within each customer,
# and replace the original row labels with a fresh 0..n-1 index.
raw_trans_data_sorted = (
    raw_trans_data
    .sort_values(by=['customer', 'step'])
    .reset_index(drop=True)
)

In [9]:
# Display the sorted table (the rendered output elides the middle rows).
raw_trans_data_sorted

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,30,'C1000148617','5','M','28007','M1888755466','28007','es_otherservices',143.87,0
1,38,'C1000148617','5','M','28007','M1741626453','28007','es_sportsandtoys',16.69,0
2,42,'C1000148617','5','M','28007','M1888755466','28007','es_otherservices',56.18,0
3,43,'C1000148617','5','M','28007','M840466850','28007','es_tech',14.74,0
4,44,'C1000148617','5','M','28007','M1823072687','28007','es_transportation',47.42,0
...,...,...,...,...,...,...,...,...,...,...
594638,174,'C999723254','2','M','28007','M1823072687','28007','es_transportation',31.94,0
594639,176,'C999723254','2','M','28007','M1823072687','28007','es_transportation',1.92,0
594640,177,'C999723254','2','M','28007','M85975013','28007','es_food',62.55,0
594641,178,'C999723254','2','M','28007','M1823072687','28007','es_transportation',25.96,0


# Dive deeper into the transaction 

### observation 1: one customer can make multiple transactions at one merchant 

In [10]:
# All transactions between customer 'C1093826151' and merchant 'M348934600'.
# The ids are compared including their literal single quotes, as they appear in the data.
raw_trans_data_sorted.loc[
    raw_trans_data_sorted["customer"].eq("'C1093826151'")
    & raw_trans_data_sorted["merchant"].eq("'M348934600'")
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
31014,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
31016,2,'C1093826151','4','M','28007','M348934600','28007','es_transportation',37.21,0
31017,3,'C1093826151','4','M','28007','M348934600','28007','es_transportation',31.63,0
31018,4,'C1093826151','4','M','28007','M348934600','28007','es_transportation',35.86,0
31020,6,'C1093826151','4','M','28007','M348934600','28007','es_transportation',39.58,0
...,...,...,...,...,...,...,...,...,...,...
31176,175,'C1093826151','4','M','28007','M348934600','28007','es_transportation',48.55,0
31177,176,'C1093826151','4','M','28007','M348934600','28007','es_transportation',0.87,0
31178,177,'C1093826151','4','M','28007','M348934600','28007','es_transportation',43.97,0
31179,178,'C1093826151','4','M','28007','M348934600','28007','es_transportation',29.35,0


In [11]:
# Same customer/merchant pair, restricted to rows whose fraud label equals 1.
raw_trans_data_sorted.loc[
    raw_trans_data_sorted["customer"].eq("'C1093826151'")
    & raw_trans_data_sorted["merchant"].eq("'M348934600'")
    & raw_trans_data_sorted["fraud"].eq(1)
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud


In [12]:
# Keep only the transactions whose fraud label equals 1, then display them.
known_fraud = raw_trans_data_sorted.loc[raw_trans_data_sorted["fraud"].eq(1)]
known_fraud

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
60,102,'C1000148617','5','M','28007','M480139044','28007','es_health',323.64,1
139,39,'C100045114','4','M','28007','M732195782','28007','es_travel',3902.93,1
145,87,'C100045114','4','M','28007','M2122776122','28007','es_home',52.11,1
146,87,'C100045114','4','M','28007','M1873032707','28007','es_hotelservices',39.86,1
188,137,'C100045114','4','M','28007','M1353266412','28007','es_hotelservices',960.66,1
...,...,...,...,...,...,...,...,...,...,...
594118,91,'C998690782','2','M','28007','M732195782','28007','es_travel',5527.37,1
594183,160,'C998690782','2','M','28007','M480139044','28007','es_health',29.67,1
594379,6,'C999393223','1','F','28007','M855959430','28007','es_hyper',10.77,1
594381,17,'C999393223','1','F','28007','M480139044','28007','es_health',476.43,1


In [13]:
# (row count, column count) of the fraud-only subset.
known_fraud.shape

(7200, 10)

### observation 2: same (customer, merchant) pair can be flagged as fraud multiple times 

In [14]:
# Fraud rows whose (customer, merchant) pair already occurred in an earlier fraud row.
# keep='first' (the pandas default) means the first occurrence of each pair is not listed.
known_fraud.loc[
    known_fraud.duplicated(subset=['customer', 'merchant'], keep='first')
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
351,122,'C1001065306','1','M','28007','M480139044','28007','es_health',1024.36,1
361,144,'C1001065306','1','M','28007','M480139044','28007','es_health',18.18,1
362,153,'C1001065306','1','M','28007','M980657600','28007','es_sportsandtoys',270.77,1
363,155,'C1001065306','1','M','28007','M17379832','28007','es_sportsandtoys',230.36,1
1719,122,'C1007572087','2','F','28007','M732195782','28007','es_travel',7635.41,1
...,...,...,...,...,...,...,...,...,...,...
587205,82,'C974315171','3','M','28007','M980657600','28007','es_sportsandtoys',271.95,1
588725,147,'C980181294','4','F','28007','M732195782','28007','es_travel',2371.07,1
588744,159,'C980181294','4','F','28007','M480139044','28007','es_health',318.25,1
591583,141,'C989137613','4','F','28007','M732195782','28007','es_travel',5739.93,1


In [15]:
# Example pair: every fraud-labelled transaction between this customer and this merchant.
known_fraud.loc[
    known_fraud["customer"].eq("'C1001065306'")
    & known_fraud["merchant"].eq("'M980657600'")
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
360,142,'C1001065306','1','M','28007','M980657600','28007','es_sportsandtoys',100.89,1
362,153,'C1001065306','1','M','28007','M980657600','28007','es_sportsandtoys',270.77,1


In [16]:
# One row per (customer, merchant) pair: keep the first fraud row of each pair
# and discard the later repeats. The result is displayed, not stored.
known_fraud.loc[~known_fraud.duplicated(subset=['customer', 'merchant'])]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
60,102,'C1000148617','5','M','28007','M480139044','28007','es_health',323.64,1
139,39,'C100045114','4','M','28007','M732195782','28007','es_travel',3902.93,1
145,87,'C100045114','4','M','28007','M2122776122','28007','es_home',52.11,1
146,87,'C100045114','4','M','28007','M1873032707','28007','es_hotelservices',39.86,1
188,137,'C100045114','4','M','28007','M1353266412','28007','es_hotelservices',960.66,1
...,...,...,...,...,...,...,...,...,...,...
594118,91,'C998690782','2','M','28007','M732195782','28007','es_travel',5527.37,1
594183,160,'C998690782','2','M','28007','M480139044','28007','es_health',29.67,1
594379,6,'C999393223','1','F','28007','M855959430','28007','es_hyper',10.77,1
594381,17,'C999393223','1','F','28007','M480139044','28007','es_health',476.43,1


### observation 3: for the same customer and the same purchase category, the fraud flag can differ between transactions

In [17]:
# Every transaction between customer 'C1000148617' and merchant 'M480139044'.
# Open question: was this transaction flagged because the purchase category changed?
raw_trans_data_sorted.loc[
    raw_trans_data_sorted["customer"].eq("'C1000148617'")
    & raw_trans_data_sorted["merchant"].eq("'M480139044'")
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
60,102,'C1000148617','5','M','28007','M480139044','28007','es_health',323.64,1


In [18]:
# Every 'es_health' transaction made by customer 'C1000148617', across all merchants.
raw_trans_data_sorted.loc[
    raw_trans_data_sorted["customer"].eq("'C1000148617'")
    & raw_trans_data_sorted["category"].eq("'es_health'")
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
60,102,'C1000148617','5','M','28007','M480139044','28007','es_health',323.64,1
124,173,'C1000148617','5','M','28007','M1053599405','28007','es_health',165.23,0


# Load in the raw network data 

In [19]:
# Location of the raw network CSV, given as a relative path
# (resolved against the notebook's working directory).
raw_net_data_path = '../../data/01_raw/financial_fraud/bsNET140513_032310.csv'

In [20]:
# Read the raw network CSV into a DataFrame using pandas' default parsing options.
raw_net_data = pd.read_csv(raw_net_data_path)

In [21]:
# (row count, column count) of the raw network table.
raw_net_data.shape

(594643, 5)

In [22]:
# Column labels available in the raw network table.
raw_net_data.columns

Index(['Source', 'Target', 'Weight', 'typeTrans', 'fraud'], dtype='object')

#### Observation: Source is the customer id, Target is the merchant id, and Weight is the transaction amount

In [23]:
# Preview the first 10 rows of the raw network table.
raw_net_data.head(10)

Unnamed: 0,Source,Target,Weight,typeTrans,fraud
0,'C1093826151','M348934600',4.55,'es_transportation',0
1,'C352968107','M348934600',39.68,'es_transportation',0
2,'C2054744914','M1823072687',26.89,'es_transportation',0
3,'C1760612790','M348934600',17.25,'es_transportation',0
4,'C757503768','M348934600',35.72,'es_transportation',0
5,'C1315400589','M348934600',25.81,'es_transportation',0
6,'C765155274','M348934600',9.1,'es_transportation',0
7,'C202531238','M348934600',21.17,'es_transportation',0
8,'C105845174','M348934600',32.4,'es_transportation',0
9,'C39858251','M348934600',35.4,'es_transportation',0


In [24]:
# Network rows whose Source is customer 'C1093826151' and whose Target is merchant 'M348934600'.
raw_net_data.loc[
    raw_net_data["Source"].eq("'C1093826151'")
    & raw_net_data["Target"].eq("'M348934600'")
]

Unnamed: 0,Source,Target,Weight,typeTrans,fraud
0,'C1093826151','M348934600',4.55,'es_transportation',0
5076,'C1093826151','M348934600',37.21,'es_transportation',0
8664,'C1093826151','M348934600',31.63,'es_transportation',0
9936,'C1093826151','M348934600',35.86,'es_transportation',0
15042,'C1093826151','M348934600',39.58,'es_transportation',0
...,...,...,...,...,...
577785,'C1093826151','M348934600',48.55,'es_transportation',0
579805,'C1093826151','M348934600',0.87,'es_transportation',0
584013,'C1093826151','M348934600',43.97,'es_transportation',0
590115,'C1093826151','M348934600',29.35,'es_transportation',0


In [25]:
# The same customer/merchant pair looked up in the sorted transaction table,
# so its rows can be compared with the network-data rows for that pair.
raw_trans_data_sorted.loc[
    raw_trans_data_sorted["customer"].eq("'C1093826151'")
    & raw_trans_data_sorted["merchant"].eq("'M348934600'")
]

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
31014,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
31016,2,'C1093826151','4','M','28007','M348934600','28007','es_transportation',37.21,0
31017,3,'C1093826151','4','M','28007','M348934600','28007','es_transportation',31.63,0
31018,4,'C1093826151','4','M','28007','M348934600','28007','es_transportation',35.86,0
31020,6,'C1093826151','4','M','28007','M348934600','28007','es_transportation',39.58,0
...,...,...,...,...,...,...,...,...,...,...
31176,175,'C1093826151','4','M','28007','M348934600','28007','es_transportation',48.55,0
31177,176,'C1093826151','4','M','28007','M348934600','28007','es_transportation',0.87,0
31178,177,'C1093826151','4','M','28007','M348934600','28007','es_transportation',43.97,0
31179,178,'C1093826151','4','M','28007','M348934600','28007','es_transportation',29.35,0


# References

Edgar Alonso Lopez-Rojas and Stefan Axelsson. 2014. BANKSIM: A BANK PAYMENTS SIMULATOR FOR FRAUD DETECTION RESEARCH.