In [1]:
import ast 
import pandas as pd

df_usage = pd.read_csv('data/baseline.txt', sep='\t')
df_usage['app_seq'] = df_usage['app_seq'].apply(ast.literal_eval)
df_usage['recent_apps'] = df_usage['recent_apps'].apply(ast.literal_eval)
df_usage['time'] = df_usage['time'].apply(lambda x: str(x))

### 切分训练测试集

In [2]:
from sklearn.model_selection import train_test_split

df_mc = df_usage.copy()
df_mc['time'] = df_mc['time'].apply(int)
train, test = train_test_split(df_mc, test_size=0.2, random_state=2021, stratify=df_mc['user'])

### 计算{用户：app}字典

In [3]:
user_apps = df_usage.groupby('user')['app'].apply(set).apply(list).reset_index()
user_app_dict = dict(zip(user_apps['user'], user_apps['app']))

### 用MarkovChain训练，并对测试集预测，
### 计算Recall@1，Recall@5，Recall@10

In [4]:
import tqdm
import random
import numpy as np

chain = {}

users = df_mc['user'].unique()

correct = [0, 0, 0]

for user in tqdm.tqdm(users):
    chain[user] = {}

    # make markov chain
    train_per_user = train[train['user']==user]
    for i in range(len(train_per_user)):
        current = train_per_user.iloc[i]['app']
        last = train_per_user.iloc[i]['app_seq'][-1]

        if last not in chain[user]:
            chain[user].update({last: {current: 1}})
            continue
            
        if current not in chain[user][last]:
            chain[user][last].update({current: 1})
        else:
            chain[user][last].update({current: chain[user][last][current] + 1})

    # normalization, 得到转移概率
    for app, transition in chain[user].items():
        transition = dict((key, value / sum(transition.values())) for key, value in transition.items())
        chain[user][app] = transition
    
    # test
    test_per_user = test[test['user'] == user]

    for i in range(len(test_per_user)):
        last = test_per_user.iloc[i]['app_seq'][-1]

        if last not in chain[user]:
            preds = random.sample(user_app_dict[user], min(len(user_app_dict[user]), 10))
        else:
            transitions = chain[user][last]
            preds = np.random.choice(list(transitions.keys()), size=min(10, len(transitions)), replace=False, p=list(transitions.values()))
            preds = preds.tolist()
        
        answer = test_per_user.iloc[i]['app']

        if answer == preds[0]:
            correct[0] += 1
            correct[1] += 1
            correct[2] += 1
        elif answer in preds[:5]:
            correct[1] += 1
            correct[2] += 1
        elif answer in preds:
            correct[2] += 1
        else:
            pass
    
print(correct)
print("Recall@1: {}".format(correct[0] / len(test)))
print("Recall@5: {}".format(correct[1] / len(test)))
print("Recall@10: {}".format(correct[2] / len(test)))

100%|██████████| 748/748 [03:56<00:00,  3.16it/s]

[31330, 82397, 101801]
Recall@1: 0.17237584867458214
Recall@5: 0.45334353026618396
Recall@10: 0.5601032164354017



