# Notebook for data split and transform for time series model 
## This notebook consists of the following steps:
1. Process raw data into independent time series for the time series model
2. Split the data into train/test sets

In [6]:
import pandas as pd
import numpy as np

# Load raw data

In [7]:
# Relative location of the raw transactions CSV used throughout this notebook.
raw_data_path = (
    '../../data/01_raw/financial_fraud/'
    'bs140513_032310.csv'
)

In [8]:
# Load the raw transaction table from `raw_data_path` with pandas' default CSV settings.
# NOTE: the outputs below show that string fields keep their literal single quotes
# (e.g. "'C1000148617'"), so any comparison against these columns must include them.
raw_trans_data = pd.read_csv(raw_data_path)

In [9]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
raw_trans_data.shape

(594643, 10)

In [17]:
# Preview the first 10 rows to see the available columns and value formats.
raw_trans_data.head(10)

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0
5,0,'C1315400589','3','F','28007','M348934600','28007','es_transportation',25.81,0
6,0,'C765155274','1','F','28007','M348934600','28007','es_transportation',9.1,0
7,0,'C202531238','4','F','28007','M348934600','28007','es_transportation',21.17,0
8,0,'C105845174','3','M','28007','M348934600','28007','es_transportation',32.4,0
9,0,'C39858251','5','F','28007','M348934600','28007','es_transportation',35.4,0


# Save (customer, category) pairs that have more than 10 entries as time series

In [10]:
# Re-check the frame size before grouping (same check as right after loading;
# no cell in between reassigns `raw_trans_data`).
raw_trans_data.shape

(594643, 10)

In [42]:
# Number of transactions per (customer, category) pair.
# The result is a one-column frame ('step' holds the row count) indexed by
# a (customer, category) MultiIndex.
customer_category_trans_count = (
    raw_trans_data
    .groupby(['customer', 'category'])[['step']]
    .count()
)
customer_category_trans_count

Unnamed: 0_level_0,Unnamed: 1_level_0,step
customer,category,Unnamed: 2_level_1
'C1000148617','es_food',5
'C1000148617','es_health',2
'C1000148617','es_home',1
'C1000148617','es_hotelservices',1
'C1000148617','es_hyper',1
...,...,...
'C999723254','es_health',4
'C999723254','es_hyper',2
'C999723254','es_otherservices',1
'C999723254','es_transportation',104


In [43]:
# Minimum history length for a (customer, category) pair to be kept as a time series.
# The filter below uses a strict `>`, so a pair needs MORE than this many transactions.
ts_count_threshold = 10

In [44]:
# Keep only the pairs whose transaction count is strictly above the threshold,
# then turn the (customer, category) index levels back into ordinary columns.
customer_category_trans_more_than_threshold = (
    customer_category_trans_count[customer_category_trans_count['step'] > ts_count_threshold]
    .reset_index()
)

In [45]:
# Display the qualifying pairs with their transaction counts (column 'step').
customer_category_trans_more_than_threshold

Unnamed: 0,customer,category,step
0,'C1000148617','es_transportation',114
1,'C100045114','es_transportation',65
2,'C100045114','es_wellnessandbeauty',17
3,'C1000699316','es_transportation',80
4,'C1001065306','es_health',16
...,...,...,...
5211,'C998987490','es_health',11
5212,'C998987490','es_transportation',150
5213,'C999393223','es_transportation',109
5214,'C999393223','es_wellnessandbeauty',16


In [46]:
# Collect the qualifying (customer, category) pairs as a 2-column array, one row per pair.
# The source frame is `customer_category_trans_more_than_threshold`, built by the
# threshold filter above; `customer_merchant_trans_more_than_threshold` is not defined
# anywhere in this notebook and raises NameError on a fresh kernel.
customer_category_pairs_for_ts = np.array(customer_category_trans_more_than_threshold[['customer','category']])

In [47]:
# Show the pair array together with its shape (number of pairs, 2).
# Both tuple elements refer to the same `customer_category_pairs_for_ts` array;
# `customer_merchant_pairs_for_ts` is not defined in this notebook.
customer_category_pairs_for_ts, customer_category_pairs_for_ts.shape

(array([["'C1000148617'", "'es_transportation'"],
        ["'C100045114'", "'es_transportation'"],
        ["'C100045114'", "'es_wellnessandbeauty'"],
        ...,
        ["'C999393223'", "'es_transportation'"],
        ["'C999393223'", "'es_wellnessandbeauty'"],
        ["'C999723254'", "'es_transportation'"]], dtype=object),
 (5216, 2))

In [48]:
# Print the (step, amount, fraud) history of the first two (customer, category)
# pairs that contain at least one fraudulent transaction.
i = 0  # number of fraud-containing pairs printed so far
for c_m_p in customer_category_pairs_for_ts:
    if i >= 2:  # only see two examples
        break
    # Rows belonging to this customer (c_m_p[0]) and category (c_m_p[1]),
    # restricted to the three columns of interest.
    c_m_p_data = raw_trans_data.loc[
        (raw_trans_data['customer'] == c_m_p[0])
        & (raw_trans_data['category'] == c_m_p[1]),
        ['step', 'amount', 'fraud'],
    ]
    # Only pairs with at least one fraud-labelled row count as an example.
    if c_m_p_data['fraud'].sum() > 0:
        print(c_m_p, c_m_p_data)
        i += 1


["'C100045114'" "'es_wellnessandbeauty'"]         step  amount  fraud
18554      7   33.98      0
190340    65  187.85      0
202388    69  130.24      0
329377   107   35.37      0
333710   108  137.28      0
333711   108   30.99      0
333712   108   87.45      0
358842   115  152.87      0
358843   115   93.60      0
358844   115    5.30      0
452715   141    5.53      0
452716   141   56.50      0
452717   141   88.57      0
452718   141  185.77      0
507112   156   30.31      0
510372   157   48.90      1
554679   169   61.93      0
["'C1001065306'" "'es_health'"]         step   amount  fraud
56918     21    37.10      0
84525     31   108.32      0
84526     31   188.94      0
100311    36   906.87      1
106319    38   146.25      0
181718    63    31.41      0
181719    63   177.82      0
181720    63   106.47      0
383480   122  1024.36      1
400723   127    80.72      1
406796   128    26.65      0
406797   128    65.77      0
423608   133   185.40      0
423609   133   1