# Sampling User App Data
Process time: 30 minutes aprox.

- Dimensions generated:
    - Date (YYYY-MM--DD)
    - Time label (integer)

- Metrics generated: 
    - Traffic (bytes) 
    - Duration (seconds)
    - Frequency (sessions)

In [1]:
import os
import glob
import time
import json
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [4]:
def read_ConfigFile(file):
    import json
    with open(file, 'rb' ) as f:
        json_str = json.load(f)
    return json_str

In [5]:
def generate_time_labels(sampling_period_in_hours):
    """
    Divides a day in time labels according to the "sampling_period_in_hours" parameter.
    """
    
    hours_in_a_day = 24
    n_time_labels = int(hours_in_a_day/sampling_period_in_hours)
    
    df = pd.Series(range(hours_in_a_day + 1))
    df = df.to_frame()
    df.columns = ['hour']
    
    df['time_label'] = pd.qcut(df['hour'], q=n_time_labels, labels=range(1, n_time_labels + 1))

    return df

In [7]:
def get_user_app_dataset(df, sampling_hours, session_innactive_time):
    """
    Resample "App_usage_trace" dataset according to "sampling_hours". Add a time label for each period.
    Example:
    
        For a sampling of 6 hours:
        
            Time label 1: from  0 am -  6 am
            Time label 2: from  6 am - 12 pm
            Time label 3: from 12 pm -  6 pm
            Time label 4: from  6 pm -  0 am
    
    Metrics created:
    Traffic: How many data a user expends in each app in each time label in a given day?.
    Duration: How much time a user expends in each app in each time label in a given day?.
    Frequency: Number of sessions the user make for each app in each time label in a given day.

    """
    
    
    unique_users_counter = set()
    user_app = pd.DataFrame()
    
    df_time_labels = generate_time_labels(sampling_period_in_hours=sampling_hours)
    
    df['timestamp_diff'] = df['timestamp'].diff()
    groups = df.groupby(['user_id', 'app_id'])

    for indexes, group in groups:
        current_user_id = indexes[0]
        current_app_id  = indexes[1]
        group.sort_values(by=['timestamp'], inplace=True)
        group['timestamp_diff'] = group['timestamp'].diff()
        group['seconds_diff']   = group['timestamp_diff'].apply(lambda x: x.total_seconds())

        # seconds diff
        condition_list = [ group['seconds_diff'] > session_innactive_time]  
        choices_list = [1.0]
        group['total_seconds'] = np.select(condition_list, choices_list, default=group['seconds_diff'])
        group['total_seconds'].fillna(1.0, inplace=True)
        group['hour'] = group['timestamp'].dt.hour 

        group = group.merge(df_time_labels, on='hour', how='inner')
        group['date'] = group['timestamp'].dt.date

        group = group.groupby(['date', 'user_id', 'app_id', 'time_label']).agg(traffic_bytes=('traffic_bytes', 'sum'), 
                                                                               duration_sec=('total_seconds', 'sum'),
                                                                               frequency=('seconds_diff', lambda x: (x > session_innactive_time).sum()),
                                                                              )
        group = group.reset_index()
        mask  = (group['traffic_bytes']>0) & (group['duration_sec']>0)
        group = group[mask]
        group['frequency'].loc[group['frequency'] == 0] = 1

        group.to_csv(f'data/0_user_app_sampling_{sampling_hours}.csv',
                         encoding='utf-8', sep=',', mode='a', 
                         header=not os.path.exists(f'data/0_user_app_sampling_{sampling_hours}.csv'),
                         index=False,
                        )
        
        # printing updates
        users_before = len(unique_users_counter)
        unique_users_counter.add(current_user_id)
        users_after = len(unique_users_counter)
        
        
        if (users_before != users_after):
            print(f"Total users processed: {len(unique_users_counter)}")
            

In [8]:
ConfigFile = read_ConfigFile("ConfigFile.json")

In [9]:
print("SCRIPT 0 - Sampling User App Data")
print()

print("ConfigFile Parameters: ")
for param, value in ConfigFile.items():
    print(param, " : ", value)

SCRIPT 0 - Sampling User App Data

ConfigFile Parameters: 
innactive_time_between_app_session_in_seconds  :  60
sampling_period_in_hours  :  6
potential_high_value_users_file  :  0_potential_high_value_users.json
category_apps_dictionary  :  custom_category_apps_dictionary.json


In [10]:
innactive_time_between_app_session_in_seconds = ConfigFile['innactive_time_between_app_session_in_seconds']
sampling_period_in_hours = ConfigFile['sampling_period_in_hours']

# Input Sample

In [10]:
%%time
app_usage = pd.read_csv("data/0_source_data/App_usage_trace.zip",
                        compression='zip',
                        sep=' ',
                        names=['user_id', 'timestamp', 'base_station_id', 'app_id', 'traffic_bytes'],
                        parse_dates=['timestamp'],
                        #nrows=1000
                       )
app_usage

CPU times: user 2min 48s, sys: 190 ms, total: 2min 48s
Wall time: 2min 48s


Unnamed: 0,user_id,timestamp,base_station_id,app_id,traffic_bytes
0,0,2016-04-20 08:13:19,8194,361,1920
1,0,2016-04-20 08:13:20,8194,361,1764
2,0,2016-04-20 08:13:22,8194,361,4434
3,0,2016-04-20 08:13:30,8194,361,703
4,0,2016-04-20 08:13:31,8194,361,1682
...,...,...,...,...,...
4171945,999,2016-04-26 22:19:17,413,5,2533
4171946,999,2016-04-26 22:19:21,413,5,1361
4171947,999,2016-04-26 22:20:26,413,5,1365
4171948,999,2016-04-26 22:34:13,413,5,1400


In [11]:
#start process
time_start= time.asctime()
print("Start process at: ", time_start)

get_user_app_dataset(df=app_usage, 
                     sampling_hours=sampling_period_in_hours,
                    session_innactive_time=innactive_time_between_app_session_in_seconds)

# end process
time_end = time.asctime()
elapsed_time = pd.to_datetime(time_end)-pd.to_datetime(time_start)
print("End process at: ", time_end)
print("Total elapsed time (min): ", round(elapsed_time.seconds/60, 3))

Start process at:  Thu Dec 23 18:55:29 2021
Total users processed: 1
Total users processed: 2
Total users processed: 3
Total users processed: 4
Total users processed: 5
Total users processed: 6
Total users processed: 7
Total users processed: 8
Total users processed: 9
Total users processed: 10
Total users processed: 11
Total users processed: 12
Total users processed: 13
Total users processed: 14
Total users processed: 15
Total users processed: 16
Total users processed: 17
Total users processed: 18
Total users processed: 19
Total users processed: 20
Total users processed: 21
Total users processed: 22
Total users processed: 23
Total users processed: 24
Total users processed: 25
Total users processed: 26
Total users processed: 27
Total users processed: 28
Total users processed: 29
Total users processed: 30
Total users processed: 31
Total users processed: 32
Total users processed: 33
Total users processed: 34
Total users processed: 35
Total users processed: 36
Total users processed: 37
Tot

# Output Sample

In [125]:
print(f"Sampling: {sampling_period_in_hours} hours")

user_app = pd.read_csv(f"data/0_user_app_sampling_{sampling_period_in_hours}.csv",
                       sep=',',
                       encoding='utf-8',
                       parse_dates=['date'],
                      )
user_app

Unnamed: 0,date,user_id,app_id,time_label,traffic_bytes,duration_sec,frequency
0,2016-04-20,0,1,2,7703,6.0,5.0
1,2016-04-20,0,1,4,26437,37.0,4.0
2,2016-04-21,0,1,2,34645,37.0,9.0
3,2016-04-21,0,1,3,311443,15.0,6.0
4,2016-04-21,0,1,4,226411,168.0,12.0
...,...,...,...,...,...,...,...
217861,2016-04-22,997,1629,3,22458,55.0,1.0
217862,2016-04-22,997,1629,4,52615,93.0,1.0
217863,2016-04-26,997,1629,4,37313,71.0,1.0
217864,2016-04-22,998,622,3,917406,30.0,1.0


# Save potential High Value Users
Note: you can make more filtering according to the business needs before save the list.

In [14]:
potential_high_value_users = {}
potential_high_value_users['big_chinese_city'] = (pd.Series(user_app['user_id'].unique())).to_list()

In [15]:
with open("data/0_potential_high_value_users.json", "w") as fp:
    json.dump(potential_high_value_users,fp)

In [115]:
user_app['user_id']    = user_app['user_id'].astype('str')
user_app['app_id']     = user_app['app_id'].astype('str')
user_app['time_label'] = user_app['time_label'].astype('str')

In [116]:
app2category = pd.read_csv("data/0_source_data/App2Category.zip",
                        compression='zip',
                        sep='\t',
                        names=['app_id', 'category_id'],
                        dtype={'app_id':'int64', 'category_id':'int64'}
                       )
app2category

Unnamed: 0,app_id,category_id
0,1,4
1,2,4
2,3,4
3,4,13
4,5,4
...,...,...
1995,1996,5
1996,1997,14
1997,1998,0
1998,1999,10


In [117]:
categories = pd.read_csv("data/0_source_data/Categorys.zip",
                         compression='zip',
                         sep='\t',
                         names=['category_id', 'english_name'],
                        ).dropna()

categories['category_id'] = categories['category_id'].astype('int64')
categories['english_name'] = categories['english_name'].astype('str')
categories['english_name'] = categories['english_name'].str.replace('\\', '')
categories['english_name'] = categories['english_name'].str.replace("'", "")
categories['english_name'] = categories['english_name'].str.replace("Weath1er", "Weather")
categories

Unnamed: 0,category_id,english_name
0,0,Utilities
1,1,Games
2,2,Entertainment
3,3,News
4,4,Social_Networking
6,5,Shopping
7,6,Finance
8,7,Business
9,8,Travel
10,9,Lifestyle


In [118]:
categories_dictionary = app2category.merge(categories, on='category_id', how='inner')
categories_dictionary

Unnamed: 0,app_id,category_id,english_name
0,1,4,Social_Networking
1,2,4,Social_Networking
2,3,4,Social_Networking
3,5,4,Social_Networking
4,6,4,Social_Networking
...,...,...,...
1995,957,11,Health&Fitness
1996,1331,11,Health&Fitness
1997,1868,11,Health&Fitness
1998,1538,7,Business


In [119]:
categories_dictionary['app_id'] = categories_dictionary['app_id'].astype('str')
categories_dictionary['category_id'] = categories_dictionary['category_id'].astype('str')
categories_dictionary['english_name'] = categories_dictionary['english_name'].astype('str')

In [120]:
# save complete category apps dictionary
categories_dictionary_json = categories_dictionary.groupby('english_name').agg({'app_id': list}).to_dict()
categories_dictionary_json = categories_dictionary_json['app_id']
with open("data/1_category_apps_dictionaries/all_category_apps_dictionary.json", "w") as fp:
    json.dump(categories_dictionary_json,fp, indent=4)

In [121]:
user_app = user_app.merge(categories_dictionary, on='app_id', how='inner')
user_app

Unnamed: 0,date,user_id,app_id,time_label,traffic_bytes,duration_sec,frequency,category_id,english_name
0,2016-04-20,0,1,2,7703,6.0,5.0,4,Social_Networking
1,2016-04-20,0,1,4,26437,37.0,4.0,4,Social_Networking
2,2016-04-21,0,1,2,34645,37.0,9.0,4,Social_Networking
3,2016-04-21,0,1,3,311443,15.0,6.0,4,Social_Networking
4,2016-04-21,0,1,4,226411,168.0,12.0,4,Social_Networking
...,...,...,...,...,...,...,...,...,...
217861,2016-04-22,997,1629,3,22458,55.0,1.0,1,Games
217862,2016-04-22,997,1629,4,52615,93.0,1.0,1,Games
217863,2016-04-26,997,1629,4,37313,71.0,1.0,1,Games
217864,2016-04-22,998,622,3,917406,30.0,1.0,0,Utilities


In [123]:
user_app.to_csv(f'data/0_user_app_sampling_{sampling_period_in_hours}.csv',
                index=False,
                encoding='utf-8', 
                sep=',',
                )