In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("../data/fraudTrain.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [5]:
# global features

# time since last transaction
data["timedelta"] = data.groupby(by="cc_num", group_keys=True)["unix_time"].transform(
    lambda x: x - x.min()
)

# day of week
data["weekday"] = data.groupby(by="cc_num", group_keys=True)["unix_time"].transform(lambda x: (x % 60 * 60 * 24 * 7) // 7)

# total purchases per merchant type


In [6]:
def circular_mean(x, period):
    circ_dist = np.exp(1j * x / period)
    net = np.sum(circ_dist)

    return np.arctan2(net.imag, net.real)


def circular_var(x, period):
    # https://www.ebi.ac.uk/thornton-srv/software/PROCHECK/nmr_manual/man_cv.html
    circ_dist = np.exp(1j * x / period)
    net = np.sum(circ_dist)
    r = np.sqrt(net.imag**2 + net.real**2)

    return 1 - r

In [29]:
def splice(x, start, end):
    index = x.index
    values = x.values

    return pd.Series(values[start:end], index=index[start:end])


def window_aggregate(data, window=10):
    day = 60 * 60 * 24
    degrees = 180 / np.pi

    # average position
    avg_long = (
        data.groupby(by="cc_num", group_keys=True)["long"]
        .rolling(window=window, min_periods=0)
        .apply(circular_mean, kwargs={"period": degrees})
    )

    avg_long = pd.Series(avg_long.values, index=avg_long.index.get_level_values(1))

    data["avg_long"] = avg_long

    avg_lat = (
        data.groupby(by="cc_num", group_keys=True)["lat"]
        .rolling(window=window, min_periods=0)
        .apply(circular_mean, kwargs={"period": degrees})
    )
    # arctan2 returns -pi to pi
    # latitude is defined -pi / 2 to pi / 2
    # therefore we divide by two
    avg_lat = pd.Series(avg_lat.values / 2.0, index=avg_lat.index.get_level_values(1))

    data["avg_lat"] = avg_lat

    # position variance
    var_long = (
        data.groupby(by="cc_num", group_keys=True)["long"]
        .rolling(window=window, min_periods=0)
        .apply(circular_var, kwargs={"period": degrees})
    )

    var_long = pd.Series(var_long.values, index=var_long.index.get_level_values(1))

    data["var_long"] = var_long

    var_lat = (
        data.groupby(by="cc_num", group_keys=True)["lat"]
        .rolling(window=window, min_periods=0)
        .apply(circular_var, kwargs={"period": degrees})
    )

    var_lat = pd.Series(var_lat.values, index=var_lat.index.get_level_values(1))

    data["var_lat"] = var_lat

    # time average
    avg_time = (
        data.groupby(by="cc_num", group_keys=True)["unix_time"]
        .rolling(window=window, min_periods=0)
        .apply(circular_mean, kwargs={"period": day})
    )

    avg_time = pd.Series(avg_time.values, index=avg_time.index.get_level_values(1))

    data["avg_time"] = avg_time
    
    var_time = (
        data.groupby(by="cc_num", group_keys=True)["unix_time"]
        .rolling(window=window, min_periods=0)
        .apply(circular_var, kwargs={"period": day})
    )

    var_time = pd.Series(var_time.values, index=var_time.index.get_level_values(1))

    data["var_time"] = var_time
    
    avg_weekday = (
        data.groupby(by="cc_num", group_keys=True)["weekday"]
        .rolling(window=window, min_periods=0)
        .apply(circular_mean, kwargs={"period": 7.})
    )
    
    avg_weekday = pd.Series(avg_weekday.values, index=avg_weekday.index.get_level_values(1))

    data["avg_weekday"] = avg_weekday
    
    var_weekday = (
        data.groupby(by="cc_num", group_keys=True)["weekday"]
        .rolling(window=window, min_periods=0)
        .apply(circular_mean, kwargs={"period": 7.})
    )
    
    var_weekday = pd.Series(avg_weekday.values, index=avg_weekday.index.get_level_values(1))

    data["avg_weekday"] = avg_weekday

    return data

In [30]:
d = data.head(100000).copy()
window_aggregate(d, window=10)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,unix_time,merch_lat,merch_long,is_fraud,timedelta,avg_long,avg_lat,var_long,var_lat,avg_time
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,1325376018,36.011293,-82.048315,0,0,-1.416825,0.314847,0.000000e+00,0.000000e+00,2.744874
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,1325376044,49.159047,-118.186462,0,0,-2.063162,0.426627,0.000000e+00,0.000000e+00,2.745174
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,1325376051,43.150704,-112.154481,0,0,-1.959342,0.368097,0.000000e+00,0.000000e+00,2.745255
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1325376076,47.034331,-112.561071,0,0,-1.956755,0.403438,0.000000e+00,1.110223e-16,2.745545
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,1325376186,38.674999,-78.632459,0,0,-1.386889,0.335284,1.110223e-16,0.000000e+00,2.746818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,2019-02-28 15:36:49,2475085306462014,"fraud_O'Reilly, Mohr and Purdy",home,95.14,John,Miller,M,153 Mccullough Springs Apt. 857,...,1330443409,44.718105,-95.843397,0,5049515,-1.662843,0.386048,-9.000000e+00,-9.000000e+00,-2.277376
99996,99996,2019-02-28 15:37:27,4005676619255478,fraud_Kub PLC,personal_care,8.75,William,Perry,M,458 Phillips Island Apt. 768,...,1330443447,29.931844,-90.610715,0,4993051,-1.586551,0.265805,-9.000000e+00,-9.000000e+00,2.852703
99997,99997,2019-02-28 15:37:34,3519232971341141,fraud_Schuppe-Schuppe,food_dining,34.20,Michael,Jones,M,754 Smith Isle,...,1330443454,41.076153,-80.506107,0,5054527,-1.413012,0.353194,-9.000000e+00,-9.000000e+00,-2.886300
99998,99998,2019-02-28 15:38:11,4040099974063068803,fraud_Rippin-VonRueden,health_fitness,73.11,Jeffrey,Lewis,M,24255 Bryan Square,...,1330443491,48.535070,-102.524262,0,4960670,-1.784425,0.421843,-9.000000e+00,-9.000000e+00,2.847662


In [9]:
# pass trx appended to previous trx