# Preprocess Data

In [1]:
import pandas as pd
import json
from tqdm import tqdm

## Device Profile

In [2]:
android_df = pd.read_csv('raw/dataTest_device_profile_android.csv')
ios_df = pd.read_csv('raw/dataTest_device_profile.csv')
joint_df = pd.concat([android_df, ios_df])
joint_df['params'] = joint_df['params'].apply(lambda x: json.loads(x))
joint_df.head()

Unnamed: 0,timestamp,userId,params
0,1596244856,170425000002238,{'hw_released': 'Tue Jul 07 08:50:39 GMT+07:00...
1,1596244830,191113586003363,{'hw_released': 'Thu Jul 02 12:19:14 GMT+07:00...
2,1596244888,200413000001404,{'hw_released': 'Thu Dec 07 11:24:05 GMT+07:00...
3,1596244831,200428000037320,{'hw_released': 'Wed Apr 22 07:53:33 GMT+07:00...
4,1596244849,190309000002524,{'hw_released': 'Thu Mar 12 12:05:05 GMT+07:00...


In [3]:
formatted = pd.json_normalize(joint_df['params'])

In [4]:
joint_df.reset_index(drop=True, inplace=True)

formatted.reset_index(drop=True, inplace=True)

In [5]:
new_df = pd.concat([joint_df[['timestamp', 'userId']], formatted], axis=1)
new_df.drop_duplicates(inplace=True)
new_df.head()

Unnamed: 0,timestamp,userId,hw_released,hw_device_id,battery_voltage,hw_board,hw_cpu_name,hw_screen_aspect_ratio,hw_screen_class,battery_type,...,sim_operator_name,hw_camera_front_optical_stabilization,hw_camera_front_resolution,hw_camera_back_resolution,sim_operator,hw_camera_back_longest_exposure_value,hw_camera_back_supported_resolution,hw_capacity_force_touch,os_multitasking,cellular_capabilities
0,1596244856,170425000002238,Tue Jul 07 08:50:39 GMT+07:00 2020,f21ebb2f9064b16e,4091 mV,exynos9810,exynos9810,2,Normal,Li-ion,...,,,,,,,,,,
1,1596244830,191113586003363,Thu Jul 02 12:19:14 GMT+07:00 2020,13cf342d200936c8,4260 mV,AGS2,hi6250,1,Large,Li-poly,...,,,,,,,,,,
2,1596244888,200413000001404,Thu Dec 07 11:24:05 GMT+07:00 2017,b41f717314855504,3508 mV,SC9830I,sc8830,1,Normal,Li-ion,...,vn,NO,4 MP,7 MP,45204.0,,3264x2448\n2592x1936\n2048x1536\n1600x1200\n12...,,,
3,1596244831,200428000037320,Wed Apr 22 07:53:33 GMT+07:00 2020,8d36707898c6955d,3877 mV,exynos9810,samsungexynos9810,2,Normal,Li-ion,...,,,,,,,,,,
4,1596244849,190309000002524,Thu Mar 12 12:05:05 GMT+07:00 2020,af1934dafd8aba01,3644 mV,mt6757,mt6757,1,Normal,Li-ion,...,,NO,15 MP,12 MP,,,4128x3096\n4128x2320\n3264x1836\n3264x2448\n30...,,,


In [6]:
required_cols = ['timestamp', 'userId', 'hw_released', 'hw_board', 'hw_screen_class', 'hw_cpu_supported_abis', 'hw_device_id', 'hw_cpu_name',
       'hw_screen_aspect_ratio', 'hw_screen_pixel_density', 'system_packages', 'battery_type', ''
       'network_wifi_mac_address', 'hw_screen_resolution', 'os_version',
       'hw_device_manufacturer', 'hw_cpu_speed', 'hw_device_model',
       'hw_screen_refresh_rate', 'os_name', 'os_root_access',
       'hw_cpu_supported_64_bit_abis', 'user_agent', 'hw_device_string',
       'os_version_name', 'hw_cpu_core_count', 'hw_cpu_supported_32_bit_abis',
       'hw_cpu_processor', 'hw_cpu_manufacturer', 'hw_screen_size',
       'hw_bluetooth_address', 'installed_packages', 'hw_ram_total',
       'hw_cpu_min_speed', 'hw_storage_total', 'list_of_wifi',
       'hw_camera_front_max_photo_resolution',
       'hw_camera_back_max_video_resolution', 'ringtone',
       'hw_camera_number_camera', 'hw_camera_back_max_photo_resolution',
       'hw_camera_front_max_video_resolution']

In [7]:
df = new_df[required_cols]

In [8]:
df.duplicated().value_counts()

False    495535
True         39
dtype: int64

In [9]:
df = df.drop_duplicates()

## Device Transaction

In [10]:
app_id = pd.read_excel('raw/AppID Merchant.xlsx')
app_id = app_id[['Merchant', 'App ID']]
app_id.columns = ['merchant', 'appid']

# format to snake case
# app_id['merchant'] = app_id['merchant'].apply(lambda x: '_'.join([c for c in x.replace(',', '').lower().split(' ') if c.isalnum()]))
app_id.head()

Unnamed: 0,merchant,appid
0,App Test,3
1,App Test,742
2,App Test,10033
3,App Test,10034
4,Business Service,26


In [11]:
columns = ['deviceID', 'transID', 'transType', 'appid', 'pmcID', 'amount', 'userID', 'osVer', 'deviceModel', 'userIP', 'reqDate', 'longitude', 'latitude', 'campaignID', 'type_transaction']
tpe = pd.read_csv('raw/tpe_device_score.csv', names=columns)
tpe.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment


In [12]:
joined_df = tpe.join(app_id.set_index('appid'), how='inner', on='appid')
joined_df.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction,merchant
0,000195863904dc8a,200808000071150,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 08:51:08.856,105.952103,10.22381,0,payment,Money Transfer
1,000195863904dc8a,200808000089357,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-08 09:47:11.279,105.952107,10.223811,0,payment,Money Transfer
2,000195863904dc8a,200809000160928,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-09 14:43:45.359,105.952099,10.223812,0,payment,Money Transfer
3,000195863904dc8a,200822000188187,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 14:33:30.566,105.952106,10.223818,0,payment,Money Transfer
4,000195863904dc8a,200822000271777,1,454,39,2000000,200119000040347,Android 28 (9),Samsung SM-J730G,125.214.48.174,2020-08-22 20:30:36.476,105.95209,10.223808,0,payment,Money Transfer


In [13]:
# Check nan
joined_df.isna().count()

deviceID            2164765
transID             2164765
transType           2164765
appid               2164765
pmcID               2164765
amount              2164765
userID              2164765
osVer               2164765
deviceModel         2164765
userIP              2164765
reqDate             2164765
longitude           2164765
latitude            2164765
campaignID          2164765
type_transaction    2164765
merchant            2164765
dtype: int64

In [14]:
# Check nan
joined_df.isna().count()

deviceID            2164765
transID             2164765
transType           2164765
appid               2164765
pmcID               2164765
amount              2164765
userID              2164765
osVer               2164765
deviceModel         2164765
userIP              2164765
reqDate             2164765
longitude           2164765
latitude            2164765
campaignID          2164765
type_transaction    2164765
merchant            2164765
dtype: int64

In [15]:
joined_df[joined_df['deviceID'] == '000195863904dc8a'].groupby(['userID', 'merchant'])['appid'].count()

userID           merchant      
200119000040347  Digital Goods      2
                 Money Transfer    16
Name: appid, dtype: int64

# Insert data into ArangoDB

## Add users

In [16]:
unique_dp_users = df['userId'].unique()
unique_tpe_users = joined_df['userID'].unique()
len(unique_dp_users), len(unique_tpe_users)

(409836, 236570)

In [17]:
# In tpe but not in dp
len(set(unique_tpe_users) - set(unique_dp_users))

1549

In [18]:
unique_total_users = list(set(unique_dp_users) | set(unique_tpe_users))

In [19]:
user_list = []
for u in tqdm(unique_total_users):
    user_list.append({
        '_key': str(u)
    })

100%|██████████| 411385/411385 [00:00<00:00, 650456.01it/s]


In [20]:
with open('json/users.json', 'w') as f:
    json.dump(user_list, f)

In [21]:
# Convert json to jsonl
!jq -c ".[]" json/users.json > jsonl/users.jsonl

In [33]:
# import to arangodbarangoimport --create-collection true --collection users --type jsonl --file "jsonl/users.jsonl" --progress true --threads 16
!arangoimport --server.password=ngph  --create-collection true --collection users --type jsonl --file "jsonl/users.jsonl" --progress true --threads 16

## Add latest devices info

In [54]:
devices_latest = df[df.groupby(['hw_device_id'])['timestamp'].transform(max) == df['timestamp']]

In [55]:
def format_packages(x):
    return x.split(', ') if type(x) is not float else 'nan'

In [56]:
def format_list_of_wifi(x):
    if type(x) is not float and x != '':
        return x.replace('"', '').split(', ')
    else:
        return 'nan'

In [57]:
def format_ringtone(x):
    return x.strip(',').split(',') if type(x) is not float else 'nan'

In [58]:
def create_device_info_doc(row, cols_exclude_devices):
    e = {
            '_id': row['hw_device_id'],
            '_key': row['hw_device_id'],
            'installed_packages': format_packages(row['installed_packages']),
            'system_packages': format_packages(row['system_packages']),
            'list_of_wifi': format_list_of_wifi(row['list_of_wifi']),
            'ringtone': format_ringtone(row['ringtone'])
        }
    for c in cols_exclude_devices:
        e[c] = str(row[c])
    return e

In [63]:
device_info = []
cols_exclude_devices = set(df.columns) - set(['hw_device_id', 'installed_packages', 'system_packages', 'list_of_wifi', 'ringtone'])
devices_latest['arangoCode'] = devices_latest.apply(lambda x: create_device_info_doc(x, cols_exclude_devices) , 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  devices_latest['arangoCode'] = devices_latest.apply(lambda x: create_device_info_edge(x, cols_exclude_devices) , 1)


In [65]:
devices_latest_list = devices_latest['arangoCode'].tolist()

In [66]:
with open('json/devices.json', 'w') as f:
    json.dump(devices_latest_list, f)

In [67]:
# Convert json to jsonl
!jq -c ".[]" json/devices.json > jsonl/devices.jsonl

In [68]:
!arangoimport --server.password=ngph  --create-collection true --collection devices --type jsonl --file "jsonl/devices.jsonl" --progress true --threads 16

Connected to ArangoDB 'http+tcp://127.0.0.1:8529, version: 3.7.2, database: '_system', username: 'root'
----------------------------------------
database:               _system
collection:             devices
create:                 yes
create database:        no
source filename:        jsonl/devices.jsonl
file type:              jsonl
threads:                16
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
[0m2020-09-21T15:00:32Z [12144] INFO [9ddf3] processed 34175981 bytes (3%) of input file
[0m[0m2020-09-21T15:00:34Z [12144] INFO [9ddf3] processed 68319195 bytes (6%) of input file
[0m[0m2020-09-21T15:00:35Z [12144] INFO [9ddf3] processed 102495176 bytes (9%) of input file
[0m[0m2020-09-21T15:00:37Z [12144] INFO [9ddf3] processed 136638390 bytes (12%) of input file
[0m[0m2020-09-21T15:00:38Z [12144] INFO [9ddf3] processed 170814371 bytes (15%) of input file
[0m[0m2020-09-21T15:00:40Z [12144] INFO [9

## Add users - devices 

In [81]:
def create_user_device_edge(row, cols_exclude_users_devices):
    e = {
            '_key': f"{str(row['userId']).strip()}_{str(row['hw_device_id'])}_{row['timestamp']}",
            '_from': 'users/' + str(row['userId']).strip(),
            '_to': 'devices/' + str(row['hw_device_id']).strip(),
            'type': 'user_use_device'
        }
    for c in cols_exclude_users_devices:
        e[c] = str(row[c]).strip()
    return e

In [82]:
cols_exclude_users_devices = set(df.columns) - set(['userId', 'hw_device_id', 'installed_packages', 'system_packages', 'list_of_wifi', 'ringtone'])
df['arangoCode'] = df.apply(lambda row: create_user_device_edge(row, cols_exclude_users_devices), 1)

In [83]:
user_devices_list = df['arangoCode'].tolist()

In [85]:
with open('json/users_use_devices.json', 'w') as f:
    json.dump(user_devices_list, f)

In [86]:
# Convert json to jsonl
!jq -c ".[]" json/users_use_devices.json > jsonl/users_use_devices.jsonl

In [87]:
!arangoimport --server.password=ngph --create-collection true --create-collection-type edge --collection users_devices  --type jsonl --file "jsonl/users_use_devices.jsonl" --progress true --threads 16

Connected to ArangoDB 'http+tcp://127.0.0.1:8529, version: 3.7.2, database: '_system', username: 'root'
----------------------------------------
database:               _system
collection:             users_devices
create:                 yes
create database:        no
source filename:        jsonl/users_use_devices.jsonl
file type:              jsonl
threads:                16
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
[0m2020-09-21T15:10:59Z [12716] INFO [9ddf3] processed 21593453 bytes (3%) of input file
[0m[0m2020-09-21T15:11:00Z [12716] INFO [9ddf3] processed 43186906 bytes (6%) of input file
[0m[0m2020-09-21T15:11:01Z [12716] INFO [9ddf3] processed 64747592 bytes (9%) of input file
[0m[0m2020-09-21T15:11:02Z [12716] INFO [9ddf3] processed 86341045 bytes (12%) of input file
[0m[0m2020-09-21T15:11:03Z [12716] INFO [9ddf3] processed 107934498 bytes (15%) of input file
[0m[0m2020-09-21T15:11:04Z [

## Devices model

In [16]:
unique_device_model = df['hw_device_model'].apply(lambda x: x.lower().replace('-', '_').replace(' ', '_')).unique()
dm_list = []
for uq in unique_device_model:
    dm_list.append({
        '_key': uq
    })

In [17]:
with open('json/device_model.json', 'w') as f:
    json.dump(dm_list, f)

In [18]:
# Convert json to jsonl
!jq -c ".[]" json/device_model.json > jsonl/device_model.jsonl

In [21]:
!arangoimport --server.password=ngph --collection device_model --create-collection true --type jsonl --file "jsonl/device_model.jsonl" --progress true --threads 16

Connected to ArangoDB 'http+tcp://127.0.0.1:8529, version: 3.7.2, database: '_system', username: 'root'
----------------------------------------
database:               _system
collection:             device_model
create:                 yes
create database:        no
source filename:        jsonl/device_model.jsonl
file type:              jsonl
threads:                16
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
[0m2020-09-21T14:42:03Z [10700] INFO [9ddf3] processed 24309 bytes (3%) of input file
[0m
created:          1200
updated/replaced: 0
ignored:          0


## Device - DeviceModel

In [22]:
ddm_df = df.groupby(['hw_device_id', 'hw_device_model'], as_index=False).timestamp.max()
ddm_df.head()

Unnamed: 0,hw_device_id,hw_device_model,timestamp
0,000081FB-CAE0-4DBE-8983-D54759750014,iPhone 6s,1596520549
1,0000B1B7-9075-487E-B363-0FF4AE8FF05C,iPhone 7 Plus,1598881038
2,0000B625-66C9-4B23-8509-67B50D0737AE,iPhone 11,1597714312
3,000103C2-C989-435C-8E5E-15A0D9E03199,iPhone 6s,1596728757
4,00017D00-6F9E-4956-AB4C-FEA04F5115F1,iPhone 6,1598836556


In [23]:
def create_edge(row):
    return {
            '_from': 'devices/' + str(row['hw_device_id']),
            '_to': 'device_model/' + str(row['hw_device_model'].lower().replace('-', '_').replace(' ', '_')),
            'timestamp': str(row['timestamp'])
    }

In [24]:
ddm_df['arangoCode'] = ddm_df.apply(create_edge,1)

In [25]:
ddm_list = ddm_df['arangoCode'].tolist()

In [26]:
with open('json/device_devicemodel.json', 'w') as f:
    json.dump(ddm_list, f)

In [27]:
# Convert json to jsonl
!jq -c ".[]" json/device_devicemodel.json > jsonl/device_devicemodel.jsonl

In [30]:
!arangoimport --server.password=ngph --collection device_devicemodel --create-collection true --create-collection-type edge --type jsonl --file "jsonl/device_devicemodel.jsonl" --progress true --threads 16

Connected to ArangoDB 'http+tcp://127.0.0.1:8529, version: 3.7.2, database: '_system', username: 'root'
----------------------------------------
database:               _system
collection:             device_devicemodel
create:                 yes
create database:        no
source filename:        jsonl/device_devicemodel.jsonl
file type:              jsonl
threads:                16
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
[0m2020-09-21T14:44:19Z [10943] INFO [9ddf3] processed 1408981 bytes (3%) of input file
[0m[0m2020-09-21T14:44:19Z [10943] INFO [9ddf3] processed 2817962 bytes (6%) of input file
[0m[0m2020-09-21T14:44:19Z [10943] INFO [9ddf3] processed 4194176 bytes (9%) of input file
[0m[0m2020-09-21T14:44:19Z [10943] INFO [9ddf3] processed 5603157 bytes (12%) of input file
[0m[0m2020-09-21T14:44:19Z [10943] INFO [9ddf3] processed 6979371 bytes (15%) of input file
[0m[0m2020-09-21T14:44:19Z [

## Device Merchant TPE Activity

In [16]:
def create_user_device_tpe_edge(row, cols_excluded):
    new_dm = {
            '_key': str(row['transID']),
            '_to': 'devices/' + str(row['deviceID']),
            '_from': 'users/' + str(row['userID']),
            'type': 'transaction'
        }
    for c in cols_excluded:
        new_dm[c] = str(row[c])
    return new_dm

In [17]:
cols_excluded = set(joined_df.columns) - set(['deviceID', 'userID'])
joined_df['arangoCode'] = joined_df.apply(lambda row: create_user_device_tpe_edge(row, cols_excluded), 1)

In [19]:
tpe = joined_df['arangoCode'].tolist()

In [20]:
with open('json/tpe.json', 'w') as f:
    json.dump(tpe, f)

In [21]:
# Convert json to jsonl
!jq -c ".[]" json/tpe.json > jsonl/tpe.jsonl

In [22]:
!arangoimport --server.password=ngph --collection users_devices --type jsonl --file "jsonl/tpe.jsonl" --progress true --threads 16

Connected to ArangoDB 'http+tcp://127.0.0.1:8529, version: 3.7.2, database: '_system', username: 'root'
----------------------------------------
database:               _system
collection:             users_devices
create:                 no
create database:        no
source filename:        jsonl/tpe.jsonl
file type:              jsonl
threads:                16
connect timeout:        5
request timeout:        1200
----------------------------------------
Starting JSON import...
[0m2020-09-21T15:48:33Z [15949] INFO [9ddf3] processed 28671125 bytes (3%) of input file
[0m[0m2020-09-21T15:48:35Z [15949] INFO [9ddf3] processed 57309483 bytes (6%) of input file
[0m[0m2020-09-21T15:48:37Z [15949] INFO [9ddf3] processed 85947841 bytes (9%) of input file
[0m[0m2020-09-21T15:48:39Z [15949] INFO [9ddf3] processed 114586199 bytes (12%) of input file
[0m[0m2020-09-21T15:48:41Z [15949] INFO [9ddf3] processed 143257324 bytes (15%) of input file
[0m[0m2020-09-21T15:48:44Z [15949] INFO [9

# Test

In [10]:
df.head()

Unnamed: 0,timestamp,userId,hw_released,hw_device_id,hw_cpu_name,hw_screen_aspect_ratio,hw_screen_pixel_density,system_packages,network_wifi_mac_address,hw_screen_resolution,...,hw_ram_total,hw_cpu_min_speed,hw_storage_total,list_of_wifi,hw_camera_front_max_photo_resolution,hw_camera_back_max_video_resolution,ringtone,hw_camera_number_camera,hw_camera_back_max_photo_resolution,hw_camera_front_max_video_resolution
0,1596244856,170425000002238,Tue Jul 07 08:50:39 GMT+07:00 2020,f21ebb2f9064b16e,exynos9810,2,420 Dpi,"com.samsung.android.provider.filterprovider, c...",12:2F:14:33:9D:E7,1080x2400,...,"7,94 GB",455000Hz,118 GB,,,,,,,
1,1596244830,191113586003363,Thu Jul 02 12:19:14 GMT+07:00 2020,13cf342d200936c8,hi6250,1,320 Dpi,"com.huawei.hifolder, com.android.cts.priv.ctss...",A8:E5:44:0E:BD:BB,1200x1920,...,"2,88 GB",480000Hz,"25,23 GB","""Dinh1"", ""MaiCa"", ""NhuQuynh"", ""Support"", ""Nam ...",,,,,,
2,1596244888,200413000001404,Thu Dec 07 11:24:05 GMT+07:00 2017,b41f717314855504,sc8830,1,320 Dpi,"com.monotype.android.font.rosemary, com.sec.an...",AC:AF:B9:61:36:C8,720x1280,...,"1,33 GB",768000Hz,"4,79 GB","""Xiaomi_70BE_8F41"", ""COM TAM SONG ANH"", ""Redmi...",2576x1932,3264x2448,"Basic Bell,Basic Tone,Beep Once,Beep-Beep,Begi...",2.0,3264x2448,2576x1932
3,1596244831,200428000037320,Wed Apr 22 07:53:33 GMT+07:00 2020,8d36707898c6955d,samsungexynos9810,2,420 Dpi,"com.samsung.android.provider.filterprovider, c...",24:18:1D:FE:E3:E0,1080x2220,...,"5,89 GB",455000Hz,120 GB,,,,,,,
4,1596244849,190309000002524,Thu Mar 12 12:05:05 GMT+07:00 2020,af1934dafd8aba01,mt6757,1,420 Dpi,"com.samsung.android.provider.filterprovider, c...",A4:6C:F1:C3:34:4E,1080x1920,...,3.92 GB,247000Hz,26.75 GB,"""NamCoffee&Milktea"", ""YÊU MỘT NGƯỜI"", ""Studio1...",4608x3456,4128x3096,,2.0,4128x3096,4608x3456


In [15]:
df.groupby(['userId']).agg({'hw_device_id': 'count'}).sort_values(by="hw_device_id")

Unnamed: 0_level_0,hw_device_id
userId,Unnamed: 1_level_1
160514000003501,1
200124000031214,1
200124000031205,1
200124000031090,1
200124000031055,1
...,...
191101679002231,67
171106000000266,68
190224000013026,86
200629000000643,93


### Device Connects To Users Count

In [12]:
df.groupby('hw_device_id').agg({'userId': lambda x: len(set(x))}).sort_values('userId', ascending=False)[:10]

Unnamed: 0_level_0,userId
hw_device_id,Unnamed: 1_level_1
460968F8-068F-41C8-B130-2F5F7E968C9C,49
A5AA40DE-35CB-4406-B4E9-B10E497766A0,35
EDF10704-E7E4-4CC6-BA25-9A30C7720D02,32
027894A1-E617-4A74-9035-33E36FE16800,32
0CA30A94-6251-4727-8340-9B6BE942AACB,31
F8E47BD8-EA84-4621-94C8-104B535C3097,21
51E7555D-BC41-483A-B627-7FD3A6693E4E,20
3C9D6E46-A3C6-456E-BB39-48C0154C7F5A,20
3A8BA073-A02B-47E1-81EF-C69A62A879C2,19
D9E7640D-A830-4CDB-B653-DE1F1F3D2FC6,18


### Device connects with many types of merchants

In [52]:
joined_df.groupby('deviceID').agg({'merchant': lambda x: len(set(x))}).sort_values('merchant')

Unnamed: 0_level_0,merchant
deviceID,Unnamed: 1_level_1
000081FB-CAE0-4DBE-8983-D54759750014,1
C16652ED-8950-4035-A4D5-241F48739561,1
5CAB69E8-804E-49E5-96C2-72B65AD5D587,1
C1666B77-C749-4303-9D86-7040AB26AEAC,1
5CAB04E3-859C-4A31-98A2-B3BE01FBB3C2,1
...,...
5587B913-6EFA-4746-8BCC-0FE3CCE62805,9
2D4AE78B-F09A-4315-B8C6-9CB747C09B93,9
AB33242F-4607-4E79-9EA0-4D1373E7927C,10
0E8FAB1A-53ED-4E1E-AB1D-D1065821ED07,11


In [19]:
from datetime import datetime

In [31]:
joined_df['merchant'].unique()

array(['Money Transfer', 'Digital Goods', 'Utility', 'Transportation',
       'Marketplace', 'Entertainment', 'Food and Beverage',
       'Retail Outlet', 'Supermarket & Convenience store',
       'Business Service', 'Travel', 'Electronic Store', 'Clothing',
       'Game', 'Lodging, Hotels & Accommodations', 'App Test'],
      dtype=object)

In [30]:
joined_df[joined_df['transType'] == 6]

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction,merchant
730,12548543-8A3B-4A93-A7B7-09FE276DBF0C,200818000228991,6,6,40,500000,180704000005138,13.3.1,"iPhone11,6",14.241.142.44,2020-08-18 15:53:08.571,,,0,sending_redpacket,Money Transfer
2414,3AADCCF3-3766-4926-A65D-35AB3FF52267,200804000718259,6,6,40,10000,200628000000434,11.4.1,"iPhone7,2",58.186.61.34,2020-08-04 22:38:34.277,,,0,sending_redpacket,Money Transfer
5788,95bb6f8cf3cbe65e,200821000371227,6,6,40,100000,180210000006997,Android 28 (9),OPPO CPH2073,113.168.50.179,2020-08-21 23:25:22.751,,,0,sending_redpacket,Money Transfer
7605,C91AA6A0-5572-49F7-93E9-6095BB0371E0,200809000002489,6,6,40,200000,190701000002048,12.4.7,"iPhone7,2",171.236.69.74,2020-08-09 00:06:25.329,,,0,sending_redpacket,Money Transfer
8784,E451FE8C-63D8-48A2-AE70-1419F2E47B22,200803000525477,6,6,40,1000,180825000002178,13.3,"iPhone11,8",113.185.41.116,2020-08-03 20:07:20.467,,,0,sending_redpacket,Money Transfer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154140,1C38ACA5-B05A-414B-AEDE-274EA5AD38B6,200804000106977,6,6,40,5000,190207000012296,13.5.1,"iPhone12,1",171.253.29.255,2020-08-04 07:58:00.907,,,0,sending_redpacket,Money Transfer
2159730,8E5EBB61-784B-4B1D-AF60-FD371B945127,200827000033180,6,6,40,2000,180815000000022,13.3.1,"iPhone9,4",113.167.26.180,2020-08-27 06:09:18.258,,,0,sending_redpacket,Money Transfer
2159993,975E23DA-34B1-46F6-B9D2-4A79F23140CB,200822000141049,6,6,40,2000,191229000012821,13.6,"iPhone12,5",27.64.43.13,2020-08-22 12:14:11.073,,,0,sending_redpacket,Money Transfer
2161456,BB831245-F719-4C97-AB5E-9852FB09CD47,200825000194074,6,6,40,600000,181024000003791,12.0.1,"iPhone7,2",171.255.66.229,2020-08-25 14:17:21.426,,,0,sending_redpacket,Money Transfer


In [61]:
amount_temp = joined_df[joined_df['deviceID'] == 'A321541E-966B-485B-87C2-9783FB0457F9' & joined_df['transType']]
amount_temp.head()

Unnamed: 0,deviceID,transID,transType,appid,pmcID,amount,userID,osVer,deviceModel,userIP,reqDate,longitude,latitude,campaignID,type_transaction,merchant
1951685,A321541E-966B-485B-87C2-9783FB0457F9,200820000079622,1,454,37,200000,190419000000211,12.1.3,"iPhone8,2",103.245.252.75,2020-08-20 09:00:58.437,0.0,0.0,0,payment,Money Transfer
1951708,A321541E-966B-485B-87C2-9783FB0457F9,200807000252754,1,454,37,100000,190419000000211,12.1.3,"iPhone8,2",103.245.252.19,2020-08-07 15:47:19.774,0.0,0.0,0,payment,Money Transfer
1951710,A321541E-966B-485B-87C2-9783FB0457F9,200809000187436,1,454,37,100000,190419000000211,12.1.3,"iPhone8,2",14.169.34.58,2020-08-09 16:42:30.015,0.0,0.0,0,payment,Money Transfer
1951724,A321541E-966B-485B-87C2-9783FB0457F9,200821000100989,1,454,37,200000,190419000000211,12.1.3,"iPhone8,2",103.245.252.76,2020-08-21 09:35:06.770,106.745651,10.757929,0,payment,Money Transfer
1951730,A321541E-966B-485B-87C2-9783FB0457F9,200824000061319,1,454,37,200000,190419000000211,12.1.3,"iPhone8,2",103.245.252.76,2020-08-24 08:33:22.159,106.74563,10.757932,0,payment,Money Transfer


In [63]:
amount_temp['reqDate'] = amount_temp['reqDate'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f").date())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amount_temp['reqDate'] = amount_temp['reqDate'].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f").date())


In [65]:
amount_temp.groupby('reqDate').agg({'amount': 'sum'})

Unnamed: 0_level_0,amount
reqDate,Unnamed: 1_level_1
2020-08-03,138000
2020-08-04,153500
2020-08-05,170500
2020-08-07,184500
2020-08-08,36000
2020-08-09,140000
2020-08-10,78200
2020-08-11,57800
2020-08-13,84000
2020-08-14,310300


In [50]:
df.loc[df['os_name'] == 'android'].groupby('hw_device_id').userId.count().sort_values()

hw_device_id
64fc764c6141f8bb     1
83f5548c361f0c09     1
83f54287569ea81c     1
83f32a1b10c12b63     1
83ef5adcc755dcd0     1
                    ..
ca06a6418f2482ed    29
d775a3556b9ab2d6    30
45625b9901e2bc45    33
13942623531b1456    37
213d8639c4acde66    49
Name: userId, Length: 26843, dtype: int64