In [1]:
import ast 
import pandas as pd

df_usage = pd.read_csv('data/baseline.txt', sep='\t')
df_usage['app_seq'] = df_usage['app_seq'].apply(ast.literal_eval)
df_usage['recent_apps'] = df_usage['recent_apps'].apply(ast.literal_eval)
df_usage['time'] = df_usage['time'].apply(lambda x: str(x))

### 计算朴素贝叶斯的输入特征

In [3]:
import datetime

df_nb = df_usage.copy()

def prep_time(t):
    t = t[:-2]
    weekday = datetime.datetime.strptime(t[:-2], '%Y%m%d').weekday()
    if weekday >= 5:
        weekday = '1'
    else:
        weekday = '0'
    return '{}_{}'.format(weekday, t[-2:])

df_nb['time'] = df_nb['time'].apply(lambda x: prep_time(x))

app_seq_str = df_nb['app_seq'].apply(lambda app_seq: ' '.join([str(app) for app in app_seq]))
df_nb['nb_input'] = df_nb['time'] + ' ' + app_seq_str

df_nb.head()

Unnamed: 0,user,time,app,app_seq,recent_apps,nb_input
0,0,0_08,612,"[361, 361, 31, 360]","[361, 361, 31, 360]",0_08 361 361 31 360
1,0,0_08,31,"[361, 31, 360, 612]","[361, 361, 31, 360, 612]",0_08 361 31 360 612
2,0,0_08,360,"[31, 360, 612, 31]","[361, 361, 31, 360, 612, 31]",0_08 31 360 612 31
3,0,0_08,361,"[360, 612, 31, 360]","[361, 361, 31, 360, 612, 31, 360]",0_08 360 612 31 360
4,0,0_08,1,"[612, 31, 360, 361]","[361, 361, 31, 360, 612, 31, 360, 361]",0_08 612 31 360 361


### 切分训练/测试集

In [9]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_nb, test_size=0.2, random_state=2021, stratify=df_nb['user'])

In [11]:
# 8/2
print(train.shape)
print(test.shape)

(727016, 6)
(181754, 6)


### 用朴素贝叶斯分类器训练，并对测试集预测，
### 计算Recall@1，Recall@5，Recall@10

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

users = df_nb['user'].unique()
correct = [0, 0, 0]

for user in users:
    df_train = train[train['user']==user]
    df_test = test[test['user']==user]

    train_x = df_train['nb_input'].values.tolist()
    train_y = df_train['app'].values.tolist()

    cv = CountVectorizer()
    cv_train_x = cv.fit_transform(train_x)
    NBClassifier = MultinomialNB()
    NBClassifier.fit(cv_train_x, train_y)

    test_x = df_test['nb_input'].values.tolist()
    test_y = df_test['app'].values.tolist()

    cv_test_x = cv.transform(test_x)
    probs = NBClassifier.predict_proba(cv_test_x)
    topn = np.argsort(probs, axis=1)[:, -10:]
    topn = np.flip(topn, axis=1)    # 左右翻转数组
    topn = NBClassifier.classes_[topn]

    for answer, predictions in zip(test_y, topn):
        if answer == predictions[0]:
            correct[0] += 1
            correct[1] += 1
            correct[2] += 1
        elif answer in predictions[:5]:
            correct[1] += 1
            correct[2] += 1
        elif answer in predictions:
            correct[2] += 1
        else:
            pass

print(correct)
print("Recall@1: {}".format(correct[0] / len(test)))
print("Recall@5: {}".format(correct[1] / len(test)))
print("Recall@10: {}".format(correct[2] / len(test)))

[44903, 101800, 124394]
Recall@1: 0.24705370995961576
Recall@5: 0.5600977144932161
Recall@10: 0.6844085962344708
