# Import library and preprocess data

In [2]:
import pandas as pd

In [77]:
app_id = pd.read_excel('./AppID Merchant.xlsx')
app_id = app_id[['Merchant', 'App ID']]
app_id.columns = ['merchant', 'appid']

# format to snake case
app_id['merchant'] = app_id['merchant'].apply(lambda x: '_'.join([c for c in x.replace(',', '').lower().split(' ') if c.isalnum()]))
app_id.head()

Unnamed: 0,merchant,appid
0,app_test,3
1,app_test,742
2,app_test,10033
3,app_test,10034
4,business_service,26


In [78]:
columns = ['deviceID', 'transID', 'transType', 'appid', 'pmcID', 'amount', 'userID', 'osVer', 'deviceModel', 'userIP', 'reqDate', 'longitude', 'latitude', 'campaignID', 'type_transaction']
tpe = pd.read_csv('./tpe_device_score.csv', names=columns)
tpe.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment


In [79]:
joined_df = tpe.join(app_id.set_index('appid'), how='inner', on='appid')
joined_df.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction,merchant
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment,money_transfer
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment,money_transfer
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment,money_transfer
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment,money_transfer
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment,money_transfer


In [92]:
# Check nan
joined_df.isna().count()

deviceID            2164765
transID             2164765
transType           2164765
appid               2164765
pmcID               2164765
amount              2164765
userID              2164765
osVer               2164765
deviceModel         2164765
userIP              2164765
reqDate             2164765
longitude           2164765
latitude            2164765
campaignID          2164765
type_transaction    2164765
merchant            2164765
dtype: int64

In [91]:
# Check duplicated
joined_df.duplicated().value_counts()

False    2164765
dtype: int64

In [80]:
joined_df[joined_df['deviceID'] == '000195863904dc8a'].groupby(['userID', 'merchant'])['appid'].count()

userID           merchant      
200119000040347  digital_goods      2
                 money_transfer    16
Name: appid, dtype: int64

# Connect ArangoDB

In [41]:
from pyArango.connection import Connection

In [43]:
conn = Connection(username='root', password='ngph')
db = conn['final']
merchant = db['merchant']
device_merchant = db['device_merchant']

# ArangoDB Insertion

## Add Merchant

In [81]:
merchant_unique = app_id['merchant'].unique()
merchant_unique

array(['app_test', 'business_service', 'clothing', 'digital_goods',
       'electronic_store', 'entertainment', 'food_and_beverage', 'game',
       'lodging_hotels_accommodations', 'marketplace', 'money_transfer',
       'promotion', 'retail_outlet', 'supermarket_convenience_store',
       'transportation', 'travel', 'utility'], dtype=object)

In [82]:
for m in merchant_unique:
    merchant.createDocument({
        '_key': m.strip()
    }).save()

## Add Device Merchant TPE

In [None]:
cols_excluded = set(joined_df.columns) - set(['deviceID', 'merchant'])
for _, row in joined_df.iterrows():
    new_dm = device_merchant.createDocument({
        '_from': 'devices/' + str(row['deviceID']),
        '_to': 'merchant/' + str(row['merchant']),
    })
    for c in cols_excluded:
        new_dm[c] = str(row[c])
    new_dm.save()

In [96]:
joined_df.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction,merchant
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment,money_transfer
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment,money_transfer
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment,money_transfer
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment,money_transfer
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment,money_transfer
