In [1]:
import ast 
import pandas as pd

df_usage = pd.read_csv('data/baseline.txt', sep='\t')
df_usage['app_seq'] = df_usage['app_seq'].apply(ast.literal_eval)
df_usage['recent_apps'] = df_usage['recent_apps'].apply(ast.literal_eval)
df_usage['time'] = df_usage['time'].apply(lambda x: str(x))

### 计算MFU

In [2]:
import tqdm
from collections import Counter

df_mfu = df_usage.copy()

counter = Counter()
fu_apps = []
for i in tqdm.tqdm(range(len(df_mfu))):
    user = df_mfu.iloc[i]['user']
    app = df_mfu.iloc[i]['app']
    if user not in counter:
        counter[user] = Counter(df_mfu.iloc[i]['app_seq'])

    fu_apps.append([x[0] for x in counter[user].most_common(10)])

    if app not in counter[user]:
        counter[user][app] = 1
    else:
        counter[user][app] += 1

df_mfu['mfu'] = fu_apps

100%|██████████| 908770/908770 [04:13<00:00, 3591.14it/s]


### 切分训练测试集

In [3]:
from sklearn.model_selection import train_test_split

df_mfu_train, df_mfu_test = train_test_split(df_mfu, test_size=0.2, random_state=2021, stratify=df_mfu['user'])
df_mfu_test.head()

Unnamed: 0,user,time,app,app_seq,recent_apps,mfu
324648,722,201604250612,237,"[80, 1554, 411, 1889]","[54, 46, 1353, 1466, 1, 392, 80, 1554, 411, 1889]","[1, 237, 547, 439, 1889, 9, 717, 80, 29, 1963]"
275431,667,201604211033,1,"[1, 5, 371, 1]","[1, 1, 1, 72, 1, 1, 1, 5, 371, 1]","[2, 1, 5, 77, 202, 235, 308, 398, 252, 80]"
659336,942,201604210952,384,"[208, 29, 381, 389]","[252, 105, 210, 126, 202, 287, 208, 29, 381, 389]","[2, 1, 5, 77, 252, 10, 202, 4, 398, 9]"
902536,994,201604261718,5,"[379, 70, 688, 586]","[1, 587, 126, 948, 255, 688, 379, 70, 688, 586]","[26, 586, 37, 5, 1, 31, 195, 259, 1243, 184]"
245546,615,201604231510,2,"[77, 77, 2, 202]","[426, 1, 2, 77, 202, 2, 77, 77, 2, 202]","[1, 2, 12, 19, 53, 77, 202, 72, 252, 883]"


### 用MFU对测试集预测，计算Recall@1，Recall@5，Recall@10

In [4]:
correct = [0, 0, 0]
for i in tqdm.tqdm(range(len(df_mfu_test))):
    mfu = df_mfu_test.iloc[i]['mfu']
    app = df_mfu_test.iloc[i]['app']
    if app == mfu[0]:
        correct = [x+1 for x in correct]
    elif app in mfu[:5]:
        correct[1] += 1
        correct[2] += 1
    elif app in mfu:
        correct[2] += 1 
    else:
        pass

total = len(df_mfu_test)
print("Recall@1: {}".format(correct[0] / total))
print("Recall@5: {}".format(correct[1] / total))
print("Recall@10: {}".format(correct[2] / total))

100%|██████████| 181754/181754 [00:39<00:00, 4621.43it/s]

Recall@1: 0.15946829230718443
Recall@5: 0.3928716837043476
Recall@10: 0.5103326474245409



