In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import sklearn
import tqdm
from sklearn.model_selection import train_test_split

In [4]:
df_usage = pd.read_csv('data/App_usage_trace.txt', sep=' ', names=['user', 'time', 'location', 'app', 'traffic'])
df_usage = df_usage[['user', 'time', 'app']]
df_usage

Unnamed: 0,user,time,app
0,0,20160420081319,361
1,0,20160420081320,361
2,0,20160420081322,361
3,0,20160420081330,361
4,0,20160420081331,361
...,...,...,...
4171945,999,20160426221917,5
4171946,999,20160426221921,5
4171947,999,20160426222026,5
4171948,999,20160426223413,5


In [5]:
# merging the consecutive usage records of the same app in one minute
df_usage['time'] = df_usage['time'].apply(lambda x : str(x)[:-2])
df_usage.drop_duplicates(inplace=True)

In [6]:
df_usage

Unnamed: 0,user,time,app
0,0,201604200813,361
5,0,201604200816,361
6,0,201604200816,31
7,0,201604200816,360
8,0,201604200816,612
...,...,...,...
4171944,999,201604262218,1
4171945,999,201604262219,5
4171947,999,201604262220,5
4171948,999,201604262234,5


In [21]:
# delete app used less than 10 times for all users
df_usage = df_usage[df_usage.groupby('app')['app'].transform('count').ge(10)]

In [22]:
df_usage.head()

Unnamed: 0,user,time,app
0,0,201604200813,361
5,0,201604200816,361
6,0,201604200816,31
7,0,201604200816,360
8,0,201604200816,612


In [28]:
"""
time1 = datetime.datetime.strptime(df_usage['time'].iloc[1], '%Y%m%d%H%M')
time0 = datetime.datetime.strptime(df_usage['time'].iloc[0], '%Y%m%d%H%M')
(time1 - time0).total_seconds() // 60
"""

"\ntime1 = datetime.datetime.strptime(df_usage['time'].iloc[1], '%Y%m%d%H%M')\ntime0 = datetime.datetime.strptime(df_usage['time'].iloc[0], '%Y%m%d%H%M')\n(time1 - time0).total_seconds() // 60\n"

In [29]:
prev_user = -1
prev_time = -1
app_seq = []
time_seq = []
all_app_seq = []
all_time_seq = []

seq_length = 4

# df_usage is already sorted based on user and time sequence
for i in tqdm.tqdm(range(len(df_usage))):
    user = df_usage.iloc[i]['user']
    app = df_usage.iloc[i]['app']
    time = df_usage.iloc[i]['time']
    time = datetime.datetime.strptime(time, '%Y%m%d%H%M')

    if prev_user != user:
        app_seq = [app]
        time_seq = [time]
        all_app_seq.append([])
        all_time_seq.append([])

    else:
        # same sequence if the time gap between them is equal to or less than 7 mins
        if (time - prev_time).total_seconds() // 60 <= 7:
            if len(app_seq) == seq_length:
                all_app_seq.append(app_seq)
                # 差分时间
                all_time_seq.append([(prev_time - x).total_seconds() // 60 for x in time_seq])
                app_seq = app_seq[1:] + [app]
                time_seq = time_seq[1:] + [time]
            else:
                app_seq.append(app)
                time_seq.append(time)
                all_app_seq.append([])
                all_time_seq.append([])
        else:
            app_seq = [app]
            time_seq = [time]
            all_app_seq.append([])
            all_time_seq.append([])
        
    prev_user = user
    prev_time = time

100%|██████████| 1123955/1123955 [06:34<00:00, 2850.68it/s]


In [33]:
df_usage['app_seq'] = all_app_seq
df_usage['time_seq'] = all_time_seq

# only filled sequences are treated as data
df_usage = df_usage[df_usage['app_seq'].map(len) != 0]
# delete users who have sequences less than 50
df_usage = df_usage[df_usage.groupby('user')['user'].transform('count').ge(50)]

df_usage[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usage['app_seq'] = all_app_seq
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usage['time_seq'] = all_time_seq


Unnamed: 0,user,time,app,app_seq,time_seq
8,0,201604200816,612,"[361, 361, 31, 360]","[3.0, 0.0, 0.0, 0.0]"
10,0,201604200817,31,"[361, 31, 360, 612]","[0.0, 0.0, 0.0, 0.0]"
13,0,201604200817,360,"[31, 360, 612, 31]","[1.0, 1.0, 1.0, 0.0]"
14,0,201604200817,361,"[360, 612, 31, 360]","[1.0, 1.0, 0.0, 0.0]"
16,0,201604200824,1,"[612, 31, 360, 361]","[1.0, 0.0, 0.0, 0.0]"
17,0,201604200829,31,"[31, 360, 361, 1]","[7.0, 7.0, 7.0, 0.0]"
19,0,201604200829,612,"[360, 361, 1, 31]","[12.0, 12.0, 5.0, 0.0]"
20,0,201604200829,360,"[361, 1, 31, 612]","[12.0, 5.0, 0.0, 0.0]"
22,0,201604200829,4,"[1, 31, 612, 360]","[5.0, 0.0, 0.0, 0.0]"
23,0,201604200831,31,"[31, 612, 360, 4]","[0.0, 0.0, 0.0, 0.0]"


In [34]:
len(df_usage)

908770

In [36]:
# represent time as weekday_time
def prep_time(t):
    t = t[:-2]
    weekday = datetime.datetime.strptime(t[:-2], '%Y%m%d').weekday()
    return '{}_{}'.format(weekday, t[-2:])

df_usage['time'] = df_usage['time'].apply(lambda x : prep_time(x))

In [37]:
df_usage.head()

Unnamed: 0,user,time,app,app_seq,time_seq
8,0,2_08,612,"[361, 361, 31, 360]","[3.0, 0.0, 0.0, 0.0]"
10,0,2_08,31,"[361, 31, 360, 612]","[0.0, 0.0, 0.0, 0.0]"
13,0,2_08,360,"[31, 360, 612, 31]","[1.0, 1.0, 1.0, 0.0]"
14,0,2_08,361,"[360, 612, 31, 360]","[1.0, 1.0, 0.0, 0.0]"
16,0,2_08,1,"[612, 31, 360, 361]","[1.0, 0.0, 0.0, 0.0]"


In [41]:
user2id = {u: i for i, u in enumerate(sorted(df_usage['user'].unique()))}
app_set = set()
for s in df_usage['app_seq'].values:
    app_set.update(s)
app2id = {a: i for i, a in enumerate(sorted(app_set))}

In [45]:
print("user nums: " + str(len(user2id)))
print("app nums: " + str(len(app2id)))

user nums: 748
app nums: 1518


In [47]:
def dict2file(dic, filename):
    with open(filename, 'w') as f:
        for k, v in dic.items():
            f.write("{}\t{}\n".format(k, v))

In [48]:
dict2file(user2id, "data/user2id.txt")
dict2file(app2id, "data/app2id.txt")

In [49]:
df_dataset = pd.DataFrame()
df_dataset['user'] = df_usage['user'].apply(lambda x : user2id[x])
df_dataset['time'] = df_usage['time']
df_dataset['app_seq'] = df_usage['app_seq'].apply(lambda x : [app2id[c] for c in x])
df_dataset['time_seq'] = df_usage['time_seq']
df_dataset['app'] = df_usage['app'].apply(lambda x : app2id[x])

In [50]:
df_dataset.shape

(908770, 5)

In [51]:
df_dataset.head()

Unnamed: 0,user,time,app_seq,time_seq,app
8,0,2_08,"[290, 290, 20, 289]","[3.0, 0.0, 0.0, 0.0]",516
10,0,2_08,"[290, 20, 289, 516]","[0.0, 0.0, 0.0, 0.0]",20
13,0,2_08,"[20, 289, 516, 20]","[1.0, 1.0, 1.0, 0.0]",289
14,0,2_08,"[289, 516, 20, 289]","[1.0, 1.0, 0.0, 0.0]",290
16,0,2_08,"[516, 20, 289, 290]","[1.0, 0.0, 0.0, 0.0]",0


In [52]:
train, test = train_test_split(df_dataset, test_size=0.2, random_state=2021, stratify=df_dataset['user'])
print(train.shape)
print(test.shape)

(727016, 5)
(181754, 5)


In [53]:
train.to_csv('data/train.txt', sep='\t', index=False)
test.to_csv('data/test.txt', sep='\t', index=False)