# Import library and preprocess data

In [2]:
import pandas as pd
import json
from tqdm import tqdm

In [3]:
app_id = pd.read_excel('raw/AppID Merchant.xlsx')
app_id = app_id[['Merchant', 'App ID']]
app_id.columns = ['merchant', 'appid']

# format to snake case
# app_id['merchant'] = app_id['merchant'].apply(lambda x: '_'.join([c for c in x.replace(',', '').lower().split(' ') if c.isalnum()]))
app_id.head()

Unnamed: 0,merchant,appid
0,App Test,3
1,App Test,742
2,App Test,10033
3,App Test,10034
4,Business Service,26


In [4]:
columns = ['deviceID', 'transID', 'transType', 'appid', 'pmcID', 'amount', 'userID', 'osVer', 'deviceModel', 'userIP', 'reqDate', 'longitude', 'latitude', 'campaignID', 'type_transaction']
tpe = pd.read_csv('raw/tpe_device_score.csv', names=columns)
tpe.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment


In [5]:
joined_df = tpe.join(app_id.set_index('appid'), how='inner', on='appid')
joined_df.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction,merchant
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment,Money Transfer
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment,Money Transfer
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment,Money Transfer
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment,Money Transfer
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment,Money Transfer


In [6]:
# Check nan
joined_df.isna().count()

deviceID            2164765
transID             2164765
transType           2164765
appid               2164765
pmcID               2164765
amount              2164765
userID              2164765
osVer               2164765
deviceModel         2164765
userIP              2164765
reqDate             2164765
longitude           2164765
latitude            2164765
campaignID          2164765
type_transaction    2164765
merchant            2164765
dtype: int64

In [7]:
# Check duplicated
joined_df.duplicated().value_counts()

False    2164765
dtype: int64

In [8]:
joined_df[joined_df['deviceID'] == '000195863904dc8a'].groupby(['userID', 'merchant'])['appid'].count()

userID           merchant      
200119000040347  Digital Goods      2
                 Money Transfer    16
Name: appid, dtype: int64

# ArangoDB Insertion

## Add Device Merchant TPE

In [9]:
tpe = []
cols_excluded = set(joined_df.columns) - set(['deviceID', 'userID'])
with tqdm(total=len(joined_df)) as pbar:
    for _, row in joined_df.iterrows():
        new_dm = {
            '_key': str(row['transID']),
            '_to': 'devices/' + str(row['deviceID']),
            '_from': 'users/' + str(row['userID']),
            'type': 'transaction'
        }
        for c in cols_excluded:
            new_dm[c] = str(row[c])
        tpe.append(new_dm)
        pbar.update(1)

100%|██████████| 2164765/2164765 [05:44<00:00, 6285.66it/s]


In [10]:
with open('json/tpe.json', 'w') as f:
    json.dump(tpe, f)

In [11]:
# Convert json to jsonl
!jq -c ".[]" json/tpe.json > jsonl/tpe.jsonl

In [None]:
# arangoimport --server.database=final --collection users_devices --type jsonl --file "jsonl/tpe.jsonl" --progress true --threads 16

In [13]:
joined_df['merchant'].unique()

array(['Money Transfer', 'Digital Goods', 'Utility', 'Transportation',
       'Marketplace', 'Entertainment', 'Food and Beverage',
       'Retail Outlet', 'Supermarket & Convenience store',
       'Business Service', 'Travel', 'Electronic Store', 'Clothing',
       'Game', 'Lodging, Hotels & Accommodations', 'App Test'],
      dtype=object)

In [34]:
joined_df.groupby('deviceID').agg({'merchant': lambda x: len(set(x))}).sort_values('merchant')

Unnamed: 0_level_0,merchant
deviceID,Unnamed: 1_level_1
000081FB-CAE0-4DBE-8983-D54759750014,1
C16652ED-8950-4035-A4D5-241F48739561,1
5CAB69E8-804E-49E5-96C2-72B65AD5D587,1
C1666B77-C749-4303-9D86-7040AB26AEAC,1
5CAB04E3-859C-4A31-98A2-B3BE01FBB3C2,1
...,...
5587B913-6EFA-4746-8BCC-0FE3CCE62805,9
2D4AE78B-F09A-4315-B8C6-9CB747C09B93,9
AB33242F-4607-4E79-9EA0-4D1373E7927C,10
0E8FAB1A-53ED-4E1E-AB1D-D1065821ED07,11


In [25]:
df.groupby('hw_device_id').agg({'userId': lambda x: len(set(x))}).sort_values('userId')

['count']