# Preprocess Data

In [6]:
import getpass
import json
import os

import pandas as pd
from pyArango.connection import Connection

In [7]:
# Read the two device-profile CSV exports (held in `android_df` and `ios_df`),
# stack them row-wise, and decode the JSON text in `params` into Python objects.
android_df = pd.read_csv('dataTest_device_profile_android.csv')
ios_df = pd.read_csv('dataTest_device_profile.csv')
joint_df = pd.concat([android_df, ios_df])
joint_df = joint_df.assign(params=joint_df['params'].apply(json.loads))
joint_df.head()

Unnamed: 0,timestamp,userId,params
0,1596244856,170425000002238,{'hw_released': 'Tue Jul 07 08:50:39 GMT+07:00...
1,1596244830,191113586003363,{'hw_released': 'Thu Jul 02 12:19:14 GMT+07:00...
2,1596244888,200413000001404,{'hw_released': 'Thu Dec 07 11:24:05 GMT+07:00...
3,1596244831,200428000037320,{'hw_released': 'Wed Apr 22 07:53:33 GMT+07:00...
4,1596244849,190309000002524,{'hw_released': 'Thu Mar 12 12:05:05 GMT+07:00...


In [8]:
# Flatten every decoded `params` record into one column per key.
formatted = pd.json_normalize(data=joint_df['params'])

In [9]:
# Give both frames a fresh 0..n-1 index so the column-wise concat in the next
# cell lines rows up by position.
joint_df = joint_df.reset_index(drop=True)

formatted = formatted.reset_index(drop=True)

In [10]:
# Put the identifying columns next to the flattened params, then drop rows
# that are exact duplicates across every column.
new_df = (
    pd.concat([joint_df[['timestamp', 'userId']], formatted], axis=1)
    .drop_duplicates()
)
new_df.head()

Unnamed: 0,timestamp,userId,hw_released,hw_device_id,battery_voltage,hw_board,hw_cpu_name,hw_screen_aspect_ratio,hw_screen_class,battery_type,...,sim_operator_name,hw_camera_front_optical_stabilization,hw_camera_front_resolution,hw_camera_back_resolution,sim_operator,hw_camera_back_longest_exposure_value,hw_camera_back_supported_resolution,hw_capacity_force_touch,os_multitasking,cellular_capabilities
0,1596244856,170425000002238,Tue Jul 07 08:50:39 GMT+07:00 2020,f21ebb2f9064b16e,4091 mV,exynos9810,exynos9810,2,Normal,Li-ion,...,,,,,,,,,,
1,1596244830,191113586003363,Thu Jul 02 12:19:14 GMT+07:00 2020,13cf342d200936c8,4260 mV,AGS2,hi6250,1,Large,Li-poly,...,,,,,,,,,,
2,1596244888,200413000001404,Thu Dec 07 11:24:05 GMT+07:00 2017,b41f717314855504,3508 mV,SC9830I,sc8830,1,Normal,Li-ion,...,vn,NO,4 MP,7 MP,45204.0,,3264x2448\n2592x1936\n2048x1536\n1600x1200\n12...,,,
3,1596244831,200428000037320,Wed Apr 22 07:53:33 GMT+07:00 2020,8d36707898c6955d,3877 mV,exynos9810,samsungexynos9810,2,Normal,Li-ion,...,,,,,,,,,,
4,1596244849,190309000002524,Thu Mar 12 12:05:05 GMT+07:00 2020,af1934dafd8aba01,3644 mV,mt6757,mt6757,1,Normal,Li-ion,...,,NO,15 MP,12 MP,,,4128x3096\n4128x2320\n3264x1836\n3264x2448\n30...,,,


In [11]:
# Columns kept for the graph import. The order is preserved because it
# determines the column order of `df`.
required_cols = [
    'timestamp',
    'userId',
    'hw_released',
    'hw_device_id',
    'hw_cpu_name',
    'hw_screen_aspect_ratio',
    'hw_screen_pixel_density',
    'system_packages',
    'network_wifi_mac_address',
    'hw_screen_resolution',
    'os_version',
    'hw_device_manufacturer',
    'hw_cpu_speed',
    'hw_device_model',
    'hw_screen_refresh_rate',
    'os_name',
    'os_root_access',
    'hw_cpu_supported_64_bit_abis',
    'user_agent',
    'hw_device_string',
    'os_version_name',
    'hw_cpu_core_count',
    'hw_cpu_supported_32_bit_abis',
    'hw_cpu_processor',
    'hw_cpu_manufacturer',
    'hw_screen_size',
    'hw_bluetooth_address',
    'installed_packages',
    'hw_ram_total',
    'hw_cpu_min_speed',
    'hw_storage_total',
    'list_of_wifi',
    'hw_camera_front_max_photo_resolution',
    'hw_camera_back_max_video_resolution',
    'ringtone',
    'hw_camera_number_camera',
    'hw_camera_back_max_photo_resolution',
    'hw_camera_front_max_video_resolution',
]

In [12]:
# Keep only the columns listed in `required_cols`, in that order.
df = new_df.loc[:, required_cols]

In [13]:
# Count rows that repeat an earlier row once the frame is narrowed to the
# required columns (True = duplicate of a previous row).
df.duplicated(keep='first').value_counts()

False    495535
True         39
dtype: int64

In [14]:
# Remove the repeated rows counted above, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

# Connect to ArangoDB and open the database and collections

In [15]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. ARANGO_USERNAME falls back to 'root'; when ARANGO_PASSWORD is unset
# the password is prompted for interactively instead.
arango_username = os.environ.get('ARANGO_USERNAME', 'root')
arango_password = os.environ.get('ARANGO_PASSWORD') or getpass.getpass('ArangoDB password: ')

conn = Connection(username=arango_username, password=arango_password)
db = conn['final']

# Handles to the collections written by the cells below.
users = db['users']
devices = db['devices']
user_device = db['user_device']
device_deviceModel = db['device_devicemodel']
device_model = db['device_model']

# Insert data into ArangoDB

## Add users

In [52]:
# Distinct user ids, in order of first appearance.
unique_users = pd.unique(df['userId'])

In [None]:
# Create one `users` document per distinct user id.
# ArangoDB document keys must be strings, and the user-device edges address
# users as 'users/' + str(userId), so the key is stringified the same way here.
for uu in unique_users:
    users.createDocument({
        '_key': str(uu)
    }).save()

## Add latest device info

In [60]:
# Keep, for every device id, the row carrying that device's latest timestamp.
# Rows whose hw_device_id is missing are left out because groupby drops them.
latest_timestamp = df.groupby('hw_device_id')['timestamp'].transform('max')
devices_latest = (
    df[df['timestamp'] == latest_timestamp]
    # Several rows can tie on the latest timestamp; `hw_device_id` is used as
    # the document `_key` in the insert cell, so keep exactly one row per
    # device (the last of the tied rows, an arbitrary but deterministic choice).
    .drop_duplicates(subset='hw_device_id', keep='last')
)

In [69]:
def format_packages(x):
    """Split a ``', '``-separated package string into a list of names.

    Parameters
    ----------
    x : str, float or None
        Raw cell value. ``None`` and any float (including NaN and float
        subclasses such as ``numpy.float64``) are treated as missing.

    Returns
    -------
    list of str or str
        ``x.split(', ')`` for string input, or the literal string ``'nan'``
        when the value is missing.
    """
    # isinstance (rather than an exact type check) also catches float
    # subclasses, and None no longer falls through to `.split`.
    if x is None or isinstance(x, float):
        return 'nan'
    return x.split(', ')

In [93]:
def format_list_of_wifi(x):
    """Turn a quoted, ``', '``-separated Wi-Fi list string into a list.

    Parameters
    ----------
    x : str, float or None
        Raw cell value. ``None``, any float (including NaN and float
        subclasses such as ``numpy.float64``) and the empty string are
        treated as missing.

    Returns
    -------
    list of str or str
        The entries with every double quote removed, split on ``', '``; or
        the literal string ``'nan'`` when the value is missing.
    """
    # isinstance (rather than an exact type check) also catches float
    # subclasses, and None no longer falls through to `.replace`.
    if x is None or isinstance(x, float) or x == '':
        return 'nan'
    return x.replace('"', '').split(', ')

In [86]:
def format_ringtone(x):
    """Split a comma-separated ringtone string into a list.

    Parameters
    ----------
    x : str, float or None
        Raw cell value. ``None`` and any float (including NaN and float
        subclasses such as ``numpy.float64``) are treated as missing.

    Returns
    -------
    list of str or str
        The value with leading/trailing commas stripped and then split on
        ``','``; or the literal string ``'nan'`` when the value is missing.
    """
    # isinstance (rather than an exact type check) also catches float
    # subclasses, and None no longer falls through to `.strip`.
    if x is None or isinstance(x, float):
        return 'nan'
    return x.strip(',').split(',')

In [95]:
# Columns copied onto each device document as plain strings: every column of
# `df` except the key column and the four fields run through the format_*
# helpers below.
cols_exclude_devices = set(df.columns).difference(
    ['hw_device_id', 'installed_packages', 'system_packages', 'list_of_wifi', 'ringtone']
)
for _, device_row in devices_latest.iterrows():
    device_fields = {
        '_key': device_row['hw_device_id'],
        'installed_packages': format_packages(device_row['installed_packages']),
        'system_packages': format_packages(device_row['system_packages']),
        'list_of_wifi': format_list_of_wifi(device_row['list_of_wifi']),
        'ringtone': format_ringtone(device_row['ringtone'])
    }
    nd = devices.createDocument(device_fields)
    # Remaining attributes are stored as their string representation.
    for col in cols_exclude_devices:
        nd[col] = str(device_row[col])
    nd.save()

## Add users - devices

In [96]:
# Columns copied onto each user -> device edge as plain strings: every column
# of `df` except the two endpoint ids and the four package/wifi/ringtone fields.
cols_exclude_users_devices = set(df.columns).difference(
    ['userId', 'hw_device_id', 'installed_packages', 'system_packages', 'list_of_wifi', 'ringtone']
)
for _, edge_row in df.iterrows():
    edge_endpoints = {
        '_from': 'users/' + str(edge_row['userId']),
        '_to': 'devices/' + str(edge_row['hw_device_id'])
    }
    ud = user_device.createDocument(edge_endpoints)
    # Remaining attributes are stored as their string representation.
    for col in cols_exclude_users_devices:
        ud[col] = str(edge_row[col])
    ud.save()

## Device models

In [None]:
# Create one `device_model` document per distinct model name, using the name
# with spaces replaced by underscores as the document key.
# Missing model names are dropped: they are floats and have no string form
# to build a key from.
unique_device_model = df['hw_device_model'].dropna().unique()

# Different raw names can collapse to the same key (e.g. 'A B' and 'A_B'),
# so track the keys already written; a set keeps the membership test O(1).
inserted = set()
for raw_model in unique_device_model:
    # A dedicated name is used here so the `formatted` DataFrame built in the
    # preprocessing cells is not overwritten by a string.
    model_key = raw_model.replace(' ', '_')
    if model_key in inserted:
        continue
    inserted.add(model_key)
    device_model.createDocument({
        '_key': model_key
    }).save()

## Device - DeviceModel

In [16]:
# Link every row's device to its model document. The model key is the model
# name with each space replaced by an underscore.
for _, link_row in df.iterrows():
    formatted_device_model = link_row['hw_device_model'].replace(' ', '_')
    edge_fields = {
        '_from': 'devices/' + str(link_row['hw_device_id']),
        '_to': 'device_model/' + formatted_device_model,
        'timestamp': str(link_row['timestamp']),
        'userId': str(link_row['userId'])
    }
    new_ddm = device_deviceModel.createDocument(edge_fields)
    new_ddm.save()