# Use D features to identify accountID 

Ideas from https://www.kaggle.com/akasyanama13/eda-what-s-behind-d-features/ and https://www.kaggle.com/c/ieee-fraud-detection/discussion/108704#latest-625718

## Prerequisites

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [51]:
import numpy as np
import pandas as pd

from src.dataset.data import Dataset
from src.features.build_features import *
from src.features.utils import convert_category_cols_lgb
from src.model.train import *
from src.visualization.visualize import *

import matplotlib.pyplot as plt
import seaborn as sns

# Disable pandas' display truncation so that DataFrame outputs in this
# notebook show every row and every column (None means "no limit").
pd.options.display.max_rows = None
pd.options.display.max_columns = None

%matplotlib inline

In [4]:
# Instantiate the project's Dataset wrapper and ask it to load its data.
# NOTE(review): presumably load_dataset() is what populates ds.X_train,
# ds.y_train and ds.X_test, which the next cell reads -- confirm against
# src/dataset/data.py.
ds = Dataset()
ds.load_dataset()

In [5]:
# Turn the index of each split into an ordinary column so that all frames
# share a fresh 0..n-1 positional index.
X = ds.X_train.reset_index()
X_test = ds.X_test.reset_index()
y = ds.y_train.reset_index().loc[:, 'isFraud']

# Keep the target next to the features; the exploration cells below display
# 'isFraud' alongside the D columns.
X['isFraud'] = y

## Sandbox

In [37]:
# Candidate "card identity": the six card columns card1 .. card6.
by = ['card%d' % n for n in range(1, 7)]

# Number of training transactions recorded for every distinct combination.
grouped = X.groupby(by, as_index=False)['TransactionID'].count()

# Show up to 20 combinations that have exactly 12 transactions -- few enough
# rows to inspect each card's history by eye.
grouped.loc[grouped['TransactionID'] == 12].head(20)

Unnamed: 0,card1,card2,card3,card4,card5,card6,TransactionID
55,1064,407.0,185.0,visa,102.0,credit,12
79,1097,555.0,144.0,mastercard,137.0,credit,12
91,1111,310.0,150.0,mastercard,224.0,debit,12
106,1128,555.0,150.0,visa,226.0,debit,12
136,1175,555.0,150.0,visa,226.0,debit,12
187,1238,310.0,150.0,mastercard,224.0,debit,12
216,1269,555.0,150.0,visa,226.0,debit,12
228,1286,555.0,150.0,visa,226.0,debit,12
265,1329,399.0,150.0,american express,198.0,credit,12
269,1334,555.0,150.0,visa,226.0,debit,12


In [15]:
# Reference card: filtering on all six cardX values below gives 7 rows.
card1, card2, card3 = 18383, 128, 150
card4, card5, card6 = 'visa', 226, 'credit'

# Keep only the transactions whose six card columns all match the reference.
X_slice = X[
    X['card1'].eq(card1)
    & X['card2'].eq(card2)
    & X['card3'].eq(card3)
    & X['card4'].eq(card4)
    & X['card5'].eq(card5)
    & X['card6'].eq(card6)
]

Now we can add a "DaysFromStart" column by dividing TransactionDT by 60\*60\*24 (the number of seconds in a day) and rounding the result to get the number of days from the starting point.

* D3 indicates number of days from the previous transaction.
* D1 could indicate days from the first transaction.

Can we relax the group-by, i.e. identify a card with fewer than all six cardX columns?

In [32]:
def slice(X, card1, card2, card3, card5):
    """Return the D-feature timeline of one candidate card.

    Selects the rows of ``X`` whose ``card1``, ``card2``, ``card3`` and
    ``card5`` columns equal the given values (``card4`` and ``card6`` are not
    constrained), orders them by ``TransactionID`` and keeps only the columns
    needed to compare the D features with the actual transaction timing.

    Two helper columns are appended to the result:

    * ``DaysFromStart`` -- ``TransactionDT`` divided by 60*60*24 and rounded
      to a whole number with ``np.round``.
    * ``DaysFromPreviousTransaction`` -- difference between consecutive
      ``DaysFromStart`` values (NaN on the first row).

    NOTE: the name shadows the ``slice`` builtin; it is kept unchanged because
    other cells of the notebook call the function by this name.

    Args:
        X: DataFrame holding the ``card1``/``card2``/``card3``/``card5``
            columns plus every column listed in ``features`` below.
        card1: value to match in ``X['card1']``.
        card2: value to match in ``X['card2']``.
        card3: value to match in ``X['card3']``.
        card5: value to match in ``X['card5']``.

    Returns:
        A new DataFrame, independent of ``X`` (``X`` is never modified),
        sorted by ``TransactionID``. It is empty when no row matches.

    Raises:
        KeyError: if ``X`` lacks one of the required columns.
    """
    seconds_per_day = 60 * 60 * 24
    features = ['TransactionID','TransactionDT','ProductCD', 'P_emaildomain', 'R_emaildomain', 'addr1', 'addr2'
            , 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'isFraud']

    same_card = (
        (X['card1'] == card1)
        & (X['card2'] == card2)
        & (X['card3'] == card3)
        & (X['card5'] == card5)
    )

    # Take an explicit copy: the helper columns below must be written to an
    # independent frame, not to the result of chained indexing on X (which
    # pandas flags with SettingWithCopyWarning).
    X_slice = X.loc[same_card].sort_values(['TransactionID'])[features].copy()

    X_slice['DaysFromStart'] = np.round(X_slice['TransactionDT'] / seconds_per_day, 0)
    X_slice['DaysFromPreviousTransaction'] = X_slice['DaysFromStart'].diff()
    return X_slice

In [33]:
# Same reference card as in the earlier cell, but matched on card1/2/3/5 only
# (card4 and card6 are no longer part of the filter).
slice(X, card1, card2, card3, card5)

Unnamed: 0,TransactionID,TransactionDT,ProductCD,P_emaildomain,R_emaildomain,addr1,addr2,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,isFraud,DaysFromStart,DaysFromPreviousTransaction
77340,3064340,1699373,W,gmail.com,,264.0,87.0,405.0,405.0,21.0,405.0,21.0,,,,,441.0,,,,,405.0,0,20.0,
90370,3077370,1887400,R,anonymous.com,anonymous.com,264.0,87.0,371.0,371.0,371.0,0.0,,371.0,371.0,,,371.0,,,,0.0,,0,22.0,2.0
162642,3149642,3444196,W,gmail.com,,264.0,87.0,425.0,425.0,20.0,425.0,20.0,,,,,461.0,,,,,425.0,0,40.0,18.0
232040,3219040,5504516,W,gmail.com,,264.0,87.0,449.0,449.0,24.0,449.0,24.0,,,,,485.0,,,,,449.0,0,64.0,24.0
336013,3323013,8275288,W,gmail.com,,264.0,87.0,481.0,481.0,32.0,481.0,32.0,,,,,517.0,0.0,,,,481.0,0,96.0,32.0
425671,3412671,10772608,W,gmail.com,,264.0,87.0,510.0,510.0,29.0,510.0,29.0,,,,,546.0,0.0,,,,510.0,0,125.0,29.0
511094,3498094,13378525,W,gmail.com,,264.0,87.0,540.0,540.0,30.0,540.0,30.0,,,,,576.0,59.0,,,,540.0,0,155.0,30.0


In [34]:
# Another candidate card (card1=1189, card2=555, card3=150, card5=226),
# inspected with the same relaxed four-column filter.
slice(X, 1189, 555, 150, 226)

Unnamed: 0,TransactionID,TransactionDT,ProductCD,P_emaildomain,R_emaildomain,addr1,addr2,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,isFraud,DaysFromStart,DaysFromPreviousTransaction
26390,3013390,674535,W,yahoo.com,,123.0,87.0,0.0,,,0.0,,,,,,0.0,0.0,,,,0.0,0,8.0,
139008,3126008,2838386,W,gmail.com,,123.0,87.0,487.0,487.0,35.0,378.0,35.0,,,,,487.0,,,,,487.0,0,33.0,25.0
231943,3218943,5503310,W,gmail.com,,123.0,87.0,518.0,518.0,31.0,518.0,433.0,,,,,518.0,487.0,,,,518.0,0,64.0,31.0
302276,3289276,7484610,W,gmail.com,,123.0,87.0,541.0,541.0,23.0,432.0,54.0,,,,,541.0,510.0,,,,541.0,0,87.0,23.0
418508,3405508,10593623,W,gmail.com,,123.0,87.0,577.0,577.0,36.0,0.0,,,,,,577.0,546.0,,,,577.0,0,123.0,36.0
508339,3495339,13307505,W,gmail.com,,123.0,87.0,609.0,609.0,31.0,499.0,67.0,,,,,608.0,577.0,,,,608.0,0,154.0,31.0
588122,3575122,15733056,W,gmail.com,,123.0,87.0,637.0,637.0,28.0,527.0,28.0,,,,,636.0,605.0,,,,636.0,0,182.0,28.0


In [35]:
# Another candidate card (card1=1012, card2=479, card3=150, card5=162),
# inspected with the same relaxed four-column filter.
slice(X, 1012, 479.0, 150.0, 162.0)

Unnamed: 0,TransactionID,TransactionDT,ProductCD,P_emaildomain,R_emaildomain,addr1,addr2,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,isFraud,DaysFromStart,DaysFromPreviousTransaction
21268,3008268,562205,H,yahoo.com,,143.0,87.0,0.0,,,,,,,12.5,0.5,,,,,,,0,7.0,
28198,3015198,702331,H,gmail.com,,143.0,87.0,0.0,,,,,,,123.125,0.125,,,,,,,0,8.0,1.0
82808,3069808,1784656,W,,,143.0,87.0,0.0,,,,,,,,,0.0,0.0,,,,0.0,0,21.0,13.0
91699,3078699,1900126,W,,,143.0,87.0,0.0,,,,,,,,,0.0,0.0,,,,0.0,0,22.0,1.0
111185,3098185,2166984,R,anonymous.com,anonymous.com,315.0,87.0,0.0,,,,,,,492.041656,0.041666,,,,,,,0,25.0,3.0
226586,3213586,5352558,W,icloud.com,,143.0,87.0,227.0,227.0,220.0,490.0,0.0,,,,,490.0,0.0,,,,490.0,0,62.0,37.0
421527,3408527,10682474,W,gmail.com,,315.0,87.0,0.0,,,0.0,,,,,,0.0,290.0,,,,0.0,0,124.0,62.0
537124,3524124,14154216,H,yahoo.com,yahoo.com,143.0,87.0,0.0,,,0.0,,,,,,0.0,,,,,,0,164.0,40.0


In [38]:
# card1=1064 / card2=407 / card3=185 / card5=102 is the first combination
# listed in the 12-transaction table produced by the group-by cell above.
slice(X, 1064, 407, 185, 102)

Unnamed: 0,TransactionID,TransactionDT,ProductCD,P_emaildomain,R_emaildomain,addr1,addr2,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,isFraud,DaysFromStart,DaysFromPreviousTransaction
61043,3048043,1381682,C,hotmail.com,hotmail.com,,,0.0,,,0.0,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0,16.0,
85138,3072138,1805398,C,gmail.com,gmail.com,,,0.0,,,0.0,,0.0,,0.875,0.875,0.0,,0.0,0.0,0.0,0.0,0,21.0,5.0
125402,3112402,2486701,C,gmail.com,gmail.com,,,0.0,,,0.0,,0.0,,,,0.0,,0.0,,,187.0,1,29.0,8.0
125414,3112414,2486949,C,gmail.com,gmail.com,,,0.0,,,0.0,0.0,0.0,0.0,,,0.0,,0.0,,,187.0,1,29.0,0.0
263199,3250199,6352078,C,yahoo.com,yahoo.com,,,0.0,,,0.0,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0,74.0,45.0
266039,3253039,6450252,C,hotmail.com,hotmail.com,,,0.0,,,0.0,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0,75.0,1.0
291902,3278902,7186854,C,me.com,me.com,,,0.0,,,0.0,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0,83.0,8.0
291926,3278926,7187782,C,me.com,me.com,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166666,0.166666,0.0,,0.0,0.0,0.0,0.0,0,83.0,0.0
325873,3312873,8032055,C,yahoo.com,yahoo.com,,,0.0,,,14.0,0.0,14.0,0.0,,,0.0,,14.0,0.0,0.0,0.0,1,93.0,10.0
392840,3379840,9851069,C,hotmail.com,hotmail.com,,,0.0,,,0.0,,0.0,,429.0,0.0,0.0,,0.0,0.0,0.0,0.0,0,114.0,21.0
