# 14 - Kaggle Competition
# Fraud Detection


## https://inclass.kaggle.com/c/easy-ml-class

by [Alejandro Correa Bahnsen](albahnsen.com/)

version 0.1, May 2016

## Part of the class [Machine Learning for Security Informatics](https://github.com/albahnsen/ML_SecurityInformatics)


This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License]

## Fraud Detection

In [62]:
import pandas as pd
import zipfile
with zipfile.ZipFile('../datasets/fraud_transactions_kaggle.csv.zip', 'r') as z:
    f = z.open('fraud_transactions_kaggle.csv')
    data = pd.read_csv(f, index_col=0)

In [6]:
data.head()

Unnamed: 0_level_0,date,card_number,type,merchant,amount,fraud
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2011-01-01 08:00:06,1942,2,8328,65.16,0.0
1,2011-01-01 08:00:16,5629,2,42588,260.84,0.0
2,2011-01-01 08:01:28,408,2,15622,6010.05,0.0
3,2011-01-01 08:01:43,859,2,45192,348.46,0.0
4,2011-01-01 08:01:48,3786,2,35549,1160.35,0.0


In [7]:
data.tail()

Unnamed: 0_level_0,date,card_number,type,merchant,amount,fraud
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
199995,2012-12-31 17:04:18,4069,2,35828,91.22,
199996,2012-12-31 17:04:51,9,2,46923,390.95,
199997,2012-12-31 17:05:38,1481,1,-1,0.65,
199998,2012-12-31 17:05:55,1481,1,4535,390.04,
199999,2012-12-31 17:25:02,0,1,8322,308.44,


In [8]:
data.fraud.value_counts(dropna=False)

 0.0    171048
NaN      27909
 1.0      1043
Name: fraud, dtype: int64

### Estimate aggregated features

In [63]:
from datetime import datetime, timedelta
from tqdm import tqdm

Split for each account and create the date as index

In [64]:
card_numbers = data['card_number'].unique()
data['trx_id'] = data.index
data.index = pd.DatetimeIndex(data['date'])

data_ = []
for card_number in tqdm(card_numbers):
    data_.append(data.query('card_number == ' + str(card_number)))

100%|██████████| 8087/8087 [00:24<00:00, 326.43it/s]


Create Aggregated Features for one account


In [72]:
res_agg = pd.DataFrame(index=data['trx_id'].values, 
                       columns=['Trx_sum_7D', 'Trx_count_1D'])

In [73]:
trx = data_[0]

for i in range(trx.shape[0]):
    date = trx.index[i]
    trx_id = int(trx.ix[i, 'trx_id'])
    # Sum 7 D
    agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]
    res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
    # Count 1D
    agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]
    res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]

In [74]:
res_agg.mean()

Trx_sum_7D      1054.881429
Trx_count_1D       0.640693
dtype: float64

All accounts

In [75]:
for trx in tqdm(data_):
    for i in range(trx.shape[0]):
        date = trx.index[i]
        trx_id = int(trx.ix[i, 'trx_id'])
        # Sum 7 D
        agg_ = trx[date-pd.datetools.to_offset('7D').delta:date-timedelta(0,0,1)]
        res_agg.loc[trx_id, 'Trx_sum_7D'] = agg_['amount'].sum()
        # Count 1D
        agg_ = trx[date-pd.datetools.to_offset('1D').delta:date-timedelta(0,0,1)]
        res_agg.loc[trx_id, 'Trx_count_1D'] = agg_['amount'].shape[0]

100%|██████████| 8087/8087 [05:04<00:00, 26.54it/s] 
