# Notebook for splitting and transforming data for a time series model
## This notebook covers the following steps:
1. Process the raw data into independent time series for the time series model
2. Split the data into train and test sets

In [6]:
import pandas as pd
import numpy as np

# Load raw data

In [7]:
# Location of the raw transactions CSV, given relative to the notebook's working directory.
raw_data_path = '../../data/01_raw/financial_fraud/bs140513_032310.csv'

In [8]:
# Read the whole transaction log into memory with pandas' default parsing options.
# NOTE: the string fields keep the literal single quotes stored in the file
# (e.g. "'C1093826151'"), so any comparison against these columns must include them.
raw_trans_data = pd.read_csv(raw_data_path)

In [9]:
# (rows, columns) of the freshly loaded transaction table.
raw_trans_data.shape

(594643, 10)

In [17]:
# Preview the first 10 raw transactions to inspect the available columns.
raw_trans_data.head(10)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
5,0,'C1315400589','3','F','28007','M348934600','28007','es_transportation',25.81,0
6,0,'C765155274','1','F','28007','M348934600','28007','es_transportation',9.1,0
7,0,'C202531238','4','F','28007','M348934600','28007','es_transportation',21.17,0
8,0,'C105845174','3','M','28007','M348934600','28007','es_transportation',32.4,0
9,0,'C39858251','5','F','28007','M348934600','28007','es_transportation',35.4,0


# Save (customer_id, merchant_id) pairs that have more than 10 entries as time series

In [10]:
# Repeats the shape check made right after loading, before the data is aggregated.
raw_trans_data.shape

(594643, 10)

In [11]:
# Number of transactions recorded for every (customer, merchant) combination.
# The result is indexed by (customer, merchant) and has a single 'step' column
# holding the count of non-null 'step' values in each group.
customer_merchant_trans_count = (
    raw_trans_data
    .groupby(['customer', 'merchant'])[['step']]
    .count()
)
customer_merchant_trans_count

Unnamed: 0_level_0,Unnamed: 1_level_0,step
customer,merchant,Unnamed: 2_level_1
'C1000148617','M1053599405',1
'C1000148617','M1400236507',1
'C1000148617','M1741626453',1
'C1000148617','M1823072687',92
'C1000148617','M1842530320',1
...,...,...
'C999723254','M348934600',27
'C999723254','M349281107',1
'C999723254','M480139044',1
'C999723254','M855959430',2


In [12]:
# Minimum transaction count for a (customer, merchant) pair to become a time series.
# The filter below uses a strict '>', so a pair with exactly this many rows is excluded.
ts_count_threshold = 10

In [13]:
# Keep only the pairs whose transaction count is strictly above the threshold,
# then turn the (customer, merchant) index back into ordinary columns.
customer_merchant_trans_more_than_threshold = (
    customer_merchant_trans_count
    .loc[lambda counts: counts['step'] > ts_count_threshold]
    .reset_index()
)

In [14]:
# Display the pairs that passed the threshold together with their transaction counts.
customer_merchant_trans_more_than_threshold

Unnamed: 0,customer,merchant,step
0,'C1000148617','M1823072687',92
1,'C1000148617','M348934600',22
2,'C100045114','M1823072687',27
3,'C100045114','M348934600',38
4,'C1000699316','M1823072687',52
...,...,...,...
7853,'C998987490','M85975013',13
7854,'C999393223','M1823072687',19
7855,'C999393223','M348934600',90
7856,'C999723254','M1823072687',77


In [15]:
# One row per qualifying pair: column 0 is the customer id, column 1 the merchant id.
customer_merchant_pairs_for_ts = (
    customer_merchant_trans_more_than_threshold[['customer', 'merchant']].to_numpy()
)

In [16]:
# Show the pair array together with its shape (number of pairs, 2).
customer_merchant_pairs_for_ts, customer_merchant_pairs_for_ts.shape

(array([["'C1000148617'", "'M1823072687'"],
        ["'C1000148617'", "'M348934600'"],
        ["'C100045114'", "'M1823072687'"],
        ...,
        ["'C999393223'", "'M348934600'"],
        ["'C999723254'", "'M1823072687'"],
        ["'C999723254'", "'M348934600'"]], dtype=object),
 (7858, 2))

In [22]:
# Preview the (step, amount) series of the first two qualifying pairs only.
# Slicing bounds the loop and is safe when fewer than two pairs exist.
for c_m_p in customer_merchant_pairs_for_ts[:2]:
    customer_id, merchant_id = c_m_p
    # Boolean mask selecting the rows of this exact (customer, merchant) combination.
    is_pair = (raw_trans_data.customer == customer_id) & (raw_trans_data.merchant == merchant_id)
    # Select rows and columns in a single .loc call instead of chained indexing.
    c_m_p_data = raw_trans_data.loc[is_pair, ['step', 'amount']]
    print(c_m_p, c_m_p_data)
    

["'C1000148617'" "'M1823072687'"]         step  amount
124902    44   47.42
127559    45    1.71
130271    46   34.79
132312    47   55.14
138642    49   13.23
...      ...     ...
577349   175   21.80
581448   176   23.40
586068   177   50.30
587443   178   11.49
593417   179   25.99

[92 rows x 2 columns]
["'C1000148617'" "'M348934600'"]         step  amount
388527   123   45.68
392287   124   13.54
395960   125   11.80
398924   126   40.29
403961   128   37.36
413546   130   45.23
420036   132   12.81
421716   133   51.67
431483   135   25.19
435804   136   63.32
439082   137   23.92
442414   138    8.20
444167   139    9.33
447859   140   13.00
451032   141    4.31
454417   142   18.09
464234   144   19.90
465108   145   54.61
471864   146    8.77
474060   147   38.05
490250   151   55.01
527971   162   36.83
