In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.preprocessing import LabelEncoder

## data loading for sequential embedding

In [2]:
%%time
# The training file holds one JSON document per line; parse every line into a Python object.
sequential_data = []

with open('D:/DeepLearningProjectDATA/dataForSequentialEmbedding/data_train.json', 'r') as f:
    sequential_data.extend(json.loads(raw_line) for raw_line in f)

Wall time: 22.8 s


- the unique key for each sample is the combination of userid and application time
    - applicants might apply more than once (same userid, different application time)

In [3]:
%%time
# Both dictionaries are keyed by "<user_id>|<application_time>".
sequential_driver = {}    # key -> the record's 'order_info' dict
sequential_behavior = {}  # key -> list of events that ended before the application

for record in sequential_data:

    user_id, payload = record[0], record[1]
    order_info = payload['order_info']

    application_time = int(order_info['order_time'])
    sample_key = f"{user_id}|{application_time}"

    sequential_driver[sample_key] = order_info

    # We only keep events that occur before the application time. The extra "-100" margin is
    # not necessary for offline data cleaning, but the same trick is sometimes used for online
    # calculation to avoid problems caused by network slowdown.
    cutoff = application_time - 100
    sequential_behavior[sample_key] = [
        event for event in payload['data'] if event['petime'] <= cutoff
    ]

Wall time: 2.61 s


### driver understanding

In [4]:
# One row per sample; after reset_index() the "<user_id>|<application_time>" key
# sits in the 'index' column and the row labels are plain integers.
driver = pd.DataFrame(sequential_driver).T.reset_index()

# Split the composite key back into its two parts (both stay strings).
key_parts = driver['index'].map(lambda composite_key: composite_key.split('|'))
driver['user_id'] = key_parts.map(lambda parts: parts[0])
driver['application_time'] = key_parts.map(lambda parts: parts[1])

In [5]:
driver['new_client'].value_counts()

# we normally separate new client (0) and old client (1) because they have different data pattern and data density

0.0    121429
1.0     78571
Name: new_client, dtype: int64

In [6]:
# converting unix time to real time

driver['application_date'] = pd.to_datetime(driver['order_time'],unit='ms')

In [7]:
driver.head()

Unnamed: 0,index,label,new_client,order_time,overdue,user_id,application_time,application_date
0,56f889ee11df4a72955147cb2f29a638|1509322980000,0.0,0.0,1509323000000.0,0.0,56f889ee11df4a72955147cb2f29a638,1509322980000,2017-10-30 00:23:00
1,82ba63c78d5543b7b2fd1b44412ea954|1507609140000,0.0,0.0,1507609000000.0,1.0,82ba63c78d5543b7b2fd1b44412ea954,1507609140000,2017-10-10 04:19:00
2,d84540c274cc43b894997f633fcf47b9|1509373080000,0.0,0.0,1509373000000.0,1.0,d84540c274cc43b894997f633fcf47b9,1509373080000,2017-10-30 14:18:00
3,b8206ff0ea1f4cf4abda18d0e0145497|1507529520000,0.0,1.0,1507530000000.0,0.0,b8206ff0ea1f4cf4abda18d0e0145497,1507529520000,2017-10-09 06:12:00
4,6e5f6b151edd4d40b51ca3b75e392f8d|1506805920000,0.0,1.0,1506806000000.0,0.0,6e5f6b151edd4d40b51ca3b75e392f8d,1506805920000,2017-09-30 21:12:00


In [8]:
driver['label'].value_counts()

0.0    183986
1.0     16014
Name: label, dtype: int64

### data understanding

In [17]:
%%time
# Flatten the per-application event lists into one list of records (turned into a DataFrame below).
## This is not necessary during the modeling phase, because the data_generator
## (presented in "data usage") works on the raw dictionaries.

behavior = []

for composite_key, events in sequential_behavior.items():

    user_id, application_time = composite_key.split("|")
    key_columns = {"user_id": user_id, "application_time": application_time}

    # copy every event and tag the copy with the sample key it belongs to
    behavior.extend({**event, **key_columns} for event in events)

Wall time: 4.06 s


In [18]:
behavior = pd.DataFrame(behavior)

In [19]:
behavior.shape, behavior.user_id.nunique()

((5521272, 7), 117717)

In [20]:
# number of applications (sample keys) whose behavior sequence is empty
sum(len(events) == 0 for events in sequential_behavior.values())

# there might be several application without any behavior data due to many reasons (which can be ignored)
# data inclusion logic for keys with empty data will be presented in data usage

50077

In [21]:
behavior = behavior.sort_values(['user_id', 'application_time', 'petime'])

In [22]:
behavior['pname'].value_counts()

loan_index         2613003
personal_info       626540
id_verify           580250
contacts_info       452041
loan_submission     340394
operator            297867
bind_debit_card     253192
biometric_auto      168361
login               161428
register             22890
biometric_auth        5306
Name: pname, dtype: int64

### page view path overview

In [24]:
page_name = set(behavior['pname'].value_counts().index)

In [25]:
page_trannsition_overview = {
    f"from_{x}" : {
        f"to_{y}": 0 for y in page_name
    }
    for x in page_name
}

In [26]:
%%time
for events in sequential_behavior.values():

    # put each application's events in chronological order of page end time (sorts in place)
    events.sort(key=lambda event: event['petime'])

    # count every consecutive (previous page -> next page) pair;
    # sequences with fewer than two events produce no pairs
    for earlier, later in zip(events, events[1:]):
        page_trannsition_overview[f"from_{earlier['pname']}"][f"to_{later['pname']}"] += 1

Wall time: 4.37 s


In [27]:
pd.DataFrame(page_trannsition_overview)

Unnamed: 0,from_biometric_auth,from_loan_index,from_operator,from_bind_debit_card,from_register,from_contacts_info,from_id_verify,from_loan_submission,from_login,from_biometric_auto,from_personal_info
to_bind_debit_card,8,114913,688,133870,1,14,7,3660,11,0,3
to_biometric_auth,2368,139,1,1,1,5,2516,0,41,0,6
to_biometric_auto,0,28002,11,0,0,5,93522,0,2,46804,6
to_contacts_info,118,10193,17,3,0,345553,40,0,3,2,96098
to_id_verify,119,169595,38,9,8,20,408239,21,406,29,15
to_loan_index,210,1782391,162697,115315,22732,16030,69185,236433,155899,28680,2208
to_loan_submission,3,262439,475,2304,1,9,26,74852,86,0,11
to_login,7,47011,531,210,54,335,1385,5536,2603,310,68
to_operator,6,76096,131569,51,0,89757,11,49,95,5,18
to_personal_info,2461,2341,3,0,2,19,1488,4,35,92001,528004


## data usage

In [196]:
le = LabelEncoder()
le.fit([1,2,3,4])

LabelEncoder()

In [197]:
le.fit_transform([1])

array([0], dtype=int64)

In [534]:
# Page-name vocabulary for the page-type encoder `le`.
# Besides the page names listed in the value counts above, it contains
# 'bind_credit_card' and 'unknown'.
page_names_vocabulary = [
    'bind_credit_card',
    'bind_debit_card',
    'biometric_auto',
    'biometric_auth',
    'contacts_info',
    'id_verify',
    'loan_index',
    'loan_submission',
    'login',
    'operator',
    'personal_info',
    'register',
    'unknown',
]

le = LabelEncoder()
le.fit(page_names_vocabulary)

LabelEncoder()

In [200]:
## Logic for deciding which data is dropped entirely and which data is kept
## as samples with missing (empty) behavior data.

# user ids that have at least one application with a non-empty behavior sequence
user_id_with_data = set([x.split("|")[0] for x in sequential_behavior if len(sequential_behavior[x])!=0])

# user ids that have at least one application with an empty behavior sequence
user_id_without_data = set([x.split("|")[0] for x in sequential_behavior if len(sequential_behavior[x])==0])

# user ids that appear in both sets, i.e. some of their applications have events and some do not.
# NOTE(review): every id collected here is, by construction, also in user_id_with_data.
user_id_without_data_should_keep = [x for x in user_id_without_data if x in user_id_with_data]



In [201]:
final_user_id = list(user_id_with_data) + list(set(user_id_without_data_should_keep))

In [205]:
# Scratch cell: pull the page start / end timestamps out of a list of event dicts `a`.
# NOTE(review): `a` is not assigned in any cell above this one, so on a fresh kernel
# (Restart & Run All) this cell raises NameError — confirm whether it can be removed.
pstart = [x['pstime'] for x in a]
    
pend = ([x['petime'] for x in a])

In [402]:
def data_process(sequence_for_a_single_application):
    """Convert the raw events of one application into parallel feature lists.

    Parameters
    ----------
    sequence_for_a_single_application : list of dict
        Page-view events of a single application. Every event must contain
        'petime' (it is the sort key). 'pname', 'pstime', 'pid' and 'sid' are
        read with a fallback when the key is absent. The list is not modified:
        a sorted copy is used internally.

    Returns
    -------
    page_sequence
        Whatever the module-level encoder ``le.transform`` returns for the
        page names, in order of increasing 'petime'.
    page_stay_time : list
        ``(petime - pstime) / 1000`` for every event (seconds, assuming the
        timestamps are in milliseconds).
    page_lagg_time : list
        One value per consecutive pair of events (one element shorter than the
        other lists): ``(next pstime - previous petime) / 1000``, or -1 when
        that gap is 600 whole seconds (10 minutes) or more.
    pid, sid : list
        The raw 'pid' / 'sid' values, with "NAN" where the key is missing.
    """

    def field(event, key, default="NAN"):
        # value stored under `key`, or the placeholder when the event lacks that key
        return event[key] if key in event else default

    # sorted() rather than list.sort(): the caller's list must not be reordered as a side effect
    events = sorted(sequence_for_a_single_application, key=lambda event: event['petime'])

    # A missing page name is mapped to 'unknown' (a class the page encoder is fitted on)
    # rather than "NAN", which the encoder has never seen and would reject.
    page_names = [field(event, 'pname', 'unknown') for event in events]

    # NOTE: a missing 'pstime' yields the string "NAN" here, and the arithmetic below
    # then raises TypeError; a missing 'petime' already fails in the sort above.
    pstart = [field(event, 'pstime') for event in events]
    pend = [field(event, 'petime') for event in events]

    page_stay_time = [(end - start) / 1000 for start, end in zip(pstart, pend)]

    # duration between the end of the previous action and the start of the current one;
    # if this lag is 10 minutes or more its quantitative meaning is ignored (-1)
    page_lagg_time = [
        (start - previous_end) / 1000 if (start - previous_end) // 1000 < 600 else -1
        for start, previous_end in zip(pstart[1:], pend[:-1])
    ]

    page_sequence = le.transform(page_names)

    pid = [field(event, 'pid') for event in events]
    sid = [field(event, 'sid') for event in events]

    return page_sequence, page_stay_time, page_lagg_time, pid, sid


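# The lag threshold here is expressed in minutes: every lag longer than 10 minutes is ignored.
# Following the same pattern, a leading sequence is prepended to each group: [-1,0,]
# Also note that the values themselves are computed in seconds...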

In [407]:
# data generator body

def file_generator(data):
    '''
    create a generator for files

    Cycles endlessly over `data` (a list of dict-like "files") and yields, for
    one file at a time, the list of that file's distinct keys. After the last
    file it starts again from the first one.

    Raises ValueError (on the first `next()`) when `data` is empty; without
    that check the loop would spin forever without ever yielding.
    '''
    while True:
        if len(data) == 0:
            raise ValueError("file_generator needs at least one file to cycle over")
        for current_file in data:
            yield list(set(current_file.keys()))

def data_generator(original_data,ydata_driver, batch_size = 32):
    """Endlessly yield batches of processed sequences together with their labels.

    Parameters
    ----------
    original_data : dict or list of dict
        One "file" (or a list of files) mapping a sample key to the list of raw
        events of that application.
    ydata_driver : pandas.DataFrame
        Label table. Its row labels must be the same sample keys, because labels
        are fetched with ``ydata_driver.loc[keys, "overdue"]``.
    batch_size : int
        Maximum number of samples per yielded batch; the last batch of a file
        may be smaller.

    Yields
    ------
    tuple
        ``(b_sequence, p_staytime_sequence, p_lag_sequence, pid_sequence,
        sid_sequence, y_value)``: six lists with one entry per sample in the
        batch. Each lag sequence gets a leading 0 so it has one value per event.
    """
    files = original_data if isinstance(original_data, list) else [original_data]
    if len(files) == 0:
        raise ValueError("original_data must contain at least one file")

    file_gen = file_generator(files)
    file_index = 0  # position of the file whose keys file_gen yields next

    while True:
        data_keys = next(file_gen)
        # Events are read from the file the keys came from; indexing the list of
        # files with a key fails when several files are passed.
        current_file = files[file_index % len(files)]
        # Labels are re-read for every file so they always line up with data_keys.
        labels = ydata_driver.loc[data_keys, "overdue"].values.tolist()

        for start in range(0, len(data_keys), batch_size):
            end = start + batch_size
            b_sequence = []
            p_staytime_sequence = []
            p_lag_sequence = []
            pid_sequence = []
            sid_sequence = []
            for keys in data_keys[start:end]:
                page_sequence, page_stay_time, page_lagg_time, pid, sid = data_process(current_file[keys])
                b_sequence.append([int(code) for code in page_sequence])
                p_staytime_sequence.append(page_stay_time)
                # the first event has no predecessor, so its lag is set to 0
                p_lag_sequence.append([0,] + page_lagg_time)
                pid_sequence.append(pid)
                sid_sequence.append(sid)
            y_value = labels[start:end]
            yield b_sequence,p_staytime_sequence,p_lag_sequence,pid_sequence,sid_sequence,y_value

        file_index += 1

In total, a sequence length of 62 steps covers 90% of the data,
so we use 60 steps as the final sequence length.

如果说，没有足够60步，那么就用某个东西去填充他。如果超过60步，选取交易完成前的60步作为数据


    translation: if the sequence has fewer than 60 steps, we pad it with a filler value; if it has more than 60 steps, we keep the 60 steps right before the transaction is completed

时间：-1代表超时，-10代表忽略项目

    time: for lag time, -1 means the lag was too long (timed out) and -10 marks a padded (ignored) position

page: -1 代表忽略项目（即填充的东西）
    
    for page type: use -1 to do the padding

mtf: 使用： uniform，完全一致。（13个class + 一个 -1 代表填充物）
时间上： uniform，完全一致。0-10分钟，每5秒钟记录一次 or 


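    for MTF: there are several binning strategies to choose from, so it is worth looking at the options. Here: uniform bins for page type (13 classes plus -1 for padding) and uniform bins for time (0-10 minutes, one bin every 5 seconds).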


I would say, simply use GAF to build the graph. I do not know how to select a session window for the time, but GAF should be a good choice.

for the GAF: should we add a signal for time, i.e. how long has been spent?



for RNN: 
simply use the sequence data; pid and sid can also be fed in as sequences. We can ignore this part for now.




# this part is for data cleaning and output


In [410]:
import numpy as np

In [411]:
a=np.array([1])

In [412]:
np.concatenate([a,a])

array([1, 1])

In [449]:
N=20000
storestep=2000
path="D:/DeepLearningProjectDATA.json"

In [536]:
# `driver` was built with reset_index(), so its row labels are plain integers and the
# "<user_id>|<application_time>" keys live in its 'index' column. data_generator fetches
# labels with .loc[keys, "overdue"], i.e. by row label, so re-index by that column first;
# passing `driver` as-is makes the lookup raise KeyError.
data_ = data_generator(sequential_behavior, driver.set_index('index'), batch_size = 1) # iteratively read and process data from raw sequential_behavior
# batch_size controls how many samples are processed in each iteration

In [538]:
# Build N fixed-length samples: every channel is cut to its last 60 steps and,
# when shorter, padded at the front.
array_data=[]
for i_ in range(N):
    # every batch component holds exactly one sample here, so unwrap element 0 of each
    b_sequence,p_staytime_sequence,p_lag_sequence,pid_sequence,sid_sequence,y_value=[part[0] for part in next(data_)]
    if not b_sequence:
        # applications without any behavior events are skipped
        continue

    store=(b_sequence,p_staytime_sequence,p_lag_sequence,pid_sequence,sid_sequence)
    re_data=[channel[-60:] for channel in store]

    to_pad=60-len(b_sequence)
    if to_pad>0:
        # padding values per channel: page type -1, stay time -1, lag time -10, pid / sid "ignore"
        padding=[[-1]*to_pad,[-1]*to_pad,[-10]*to_pad,["ignore"]*to_pad,["ignore"]*to_pad]
        re_data=[front+tail for front,tail in zip(padding,re_data)]

    temp={"y":y_value,"data":re_data}
    array_data.append(temp)

# Periodic checkpointing every `storestep` iterations was left disabled;
# the complete list is written once, as a single JSON document.
with open(path,"w") as out_file:
    out_file.write(json.dumps(array_data))
        



'''
for pagetype: use -1 to pad
for pagestay time: use -1 to pad
for page lag time: use -10 \ in the previous ,-1 is used to present long lags
for sid: "ignore"
for pid: "ignore"

use Json to store the data

'''
        
        
    
    

'\nfor pagetype: use -1 to pad\nfor pagestay time: use -1 to pad\nfor page lag time: use -10 \\ in the previous ,-1 is used to present long lags\nfor sid: "ignore"\nfor pid: "ignore"\n\nuse Json to store the data\n\n'

# Data cleaning end here

In [543]:
import json
from sklearn.preprocessing import LabelEncoder
import numpy as np
%%time


with open(path, 'r') as f:
    for line in f.readlines():
        print("a")
        sequential_data_clean.append(json.loads(line))

a
Wall time: 3.28 s


In [552]:
def get_xy(sequential_data_clean, include_seed_row=True):
    """Stack the cleaned samples into five 2-D feature arrays plus a label list.

    Parameters
    ----------
    sequential_data_clean : iterable of dict
        Samples of the form ``{"y": label, "data": [ch0, ch1, ch2, ch3, ch4]}``
        where every channel is a sequence of equal length (60 in this notebook).
    include_seed_row : bool, default True
        When True, every returned array starts with an all-zero row of length 60,
        so the arrays have ``len(y) + 1`` rows. That is the historical behaviour
        and is kept as the default for existing callers. Pass False to get arrays
        whose rows line up one-to-one with ``y``.

    Returns
    -------
    x_list : list of numpy.ndarray
        Five arrays, one per channel. Channels 0-2 are stacked as stored;
        channels 3 and 4 are label-encoded first.
    y : list
        The ``"y"`` value of every sample, in input order.

    Notes
    -----
    The encoder is re-fitted on every single sequence, so the integer codes of
    channels 3 and 4 are only meaningful within one row.
    NOTE(review): confirm whether one shared encoding across users was intended.
    """
    seq_len = 60
    n_channels = 5
    le = LabelEncoder()

    y = []
    # Rows are collected in plain lists and stacked once at the end; concatenating
    # onto a growing array for every user copies the whole array each time.
    rows_per_channel = [[] for _ in range(n_channels)]
    if include_seed_row:
        for rows in rows_per_channel:
            rows.append(np.array([0] * seq_len))

    for user in sequential_data_clean:
        y.append(user['y'])
        channels = user['data']
        for i in range(3):
            rows_per_channel[i].append(np.asarray(channels[i]))
        # these two channels are encoded to integers before stacking
        for i in (3, 4):
            rows_per_channel[i].append(np.asarray(le.fit_transform(channels[i])))

    x_list = [
        np.stack(rows) if rows else np.empty((0, seq_len), dtype=int)
        for rows in rows_per_channel
    ]
    return x_list, y
    
    
    
    

In [554]:
xk,yk=get_xy(sequential_data_clean[0])

In [555]:
[i.shape for i in xk]

[(14889, 60), (14889, 60), (14889, 60), (14889, 60), (14889, 60)]