<h1 align="center">Intuition</h1>
<h2 align="center">Bruno Gonçalves</h2>
<h4 align="center">bgoncalves@gmail.com</h4>
<h4 align="center">@bgoncalves</h4>

In [1]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

Berka PKDD99 dataset: https://sorry.vse.cz/~berka/challenge/pkdd1999/berka.htm

In [2]:
data = pd.read_csv('data/trans.asc.gz', sep=';', dtype='str')

# Define functions to generate hashes and check that they are correct

In [3]:
def hashrow(data):
    fields = set(["trans_id","account_id", "date", "type", "operation", "amount","balance","k_symbol","bank", "account", "previous"])
    text = ";".join([str(v) for k, v in data.iteritems() if k != 'hash' and k!='correct'])
    return str(hash(text))

In [4]:
def checkhash(data):
    curr_hash = hashrow(data)
    
    result = (curr_hash == data['hash'])
    return result

In [5]:
data.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.0,,,
2,207264,704,930101,PRIJEM,VKLAD,1000.0,1000.0,,,
3,1117247,3818,930101,PRIJEM,VKLAD,600.0,600.0,,,
4,579373,1972,930102,PRIJEM,VKLAD,400.0,400.0,,,


# Calculate the original hashes

In [6]:
data['hash'] = data.progress_apply(hashrow, axis=1)
data['correct'] = data.progress_apply(checkhash, axis=1)

100%|██████████| 99999/99999 [00:03<00:00, 26620.95it/s]
100%|██████████| 99999/99999 [00:04<00:00, 20020.58it/s]


# Check that they are all correct

In [7]:
data[data['correct'] == False]

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,hash,correct


# Let's change one transaction and see if we can detect it

In [8]:
data.loc[data['trans_id'] == '207264', 'balance'] = 'FAKE VALUE'
data['correct'] = data.progress_apply(checkhash, axis=1)

100%|██████████| 99999/99999 [00:04<00:00, 20246.91it/s]


In [9]:
data[data['correct'] == False]

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,hash,correct
2,207264,704,930101,PRIJEM,VKLAD,1000.0,FAKE VALUE,,,,4288910985472910428,False


# Easily fixed

In [10]:
data.loc[data['correct'] == False, 'hash'] = data[data['correct'] == False].apply(hashrow, axis=1)
data.loc[data['correct'] == False, 'correct'] = data[data['correct'] == False].apply(checkhash, axis=1)

In [11]:
data[data['correct'] == False]

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,hash,correct


# Let's make a linked list

In [12]:
current_hash = 'Origin'
data_size = 10000

small_data = data[:data_size].copy()

for i in tqdm(range(data_size), total=data_size):
    small_data.loc[i, 'previous'] = current_hash
    current_hash = hashrow(small_data.iloc[i])
    small_data.loc[i, 'hash'] = current_hash

100%|██████████| 10000/10000 [00:15<00:00, 657.47it/s]


In [13]:
small_data['correct'] = small_data.progress_apply(checkhash, axis=1)

100%|██████████| 10000/10000 [00:00<00:00, 19101.17it/s]


In [14]:
small_data[small_data['correct'] == False]

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,hash,correct,previous


In [15]:
small_data.loc[small_data['trans_id'] == '207264', 'balance'] = '99999'

In [16]:
small_data.loc[small_data['trans_id'] == '207264', 'hash'] = hashrow(small_data.loc[small_data['trans_id'] == '207264'])
small_data['correct'] = small_data.progress_apply(checkhash, axis=1)

100%|██████████| 10000/10000 [00:00<00:00, 19369.23it/s]


In [17]:
small_data.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,hash,correct,previous
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.0,,,,5853799050689451404,True,Origin
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.0,,,,8230732003580935985,True,5853799050689451404
2,207264,704,930101,PRIJEM,VKLAD,1000.0,99999.0,,,,-3334579821466902824,False,8230732003580935985
3,1117247,3818,930101,PRIJEM,VKLAD,600.0,600.0,,,,-2268462960127567160,True,6675887716061399419
4,579373,1972,930102,PRIJEM,VKLAD,400.0,400.0,,,,-6619880664608596286,True,-2268462960127567160


In [18]:
small_data[small_data['trans_id'] == '207264'].apply(checkhash, axis=1)

2    False
dtype: bool

In [19]:
hashrow(small_data[small_data['trans_id'] == '207264'])

'-3334579821466902824'

In [20]:
checkhash(small_data[small_data['trans_id'] == '207264'])

2    True
Name: hash, dtype: bool

In [21]:
data.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,hash,correct
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.00,,,,-2689460077359345356,True
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.00,,,,3509477280261167456,True
2,207264,704,930101,PRIJEM,VKLAD,1000.0,FAKE VALUE,,,,9206719648691585510,True
3,1117247,3818,930101,PRIJEM,VKLAD,600.0,600.00,,,,7867024256693933512,True
4,579373,1972,930102,PRIJEM,VKLAD,400.0,400.00,,,,-5919845686611638368,True


In [22]:
def temp(data):
    print(data)
    return 10

In [23]:
data.loc[:10, 'ammount'].rolling(5, axis=0, min_periods=3).apply(temp)

KeyError: 'the label [ammount] is not in the [columns]'