# 针对Delicious数据集，对SimpleTagBased算法进行改进（使用NormTagBased、TagBased-TFIDF算法）

In [1]:
import pandas as pd
import math
import random
import operator
from tqdm import tqdm

In [2]:
# Tab-separated tagging log (relative path).
data_file = "user_taggedbookmarks-timestamps.dat"
# Preview only: load_data() further down reads the file again on its own.
data = pd.read_csv(data_file, sep = '\t')
# Last expression of the cell, so the first rows are displayed.
data.head()

Unnamed: 0,userID,bookmarkID,tagID,timestamp
0,8,1,1,1289255362000
1,8,2,1,1289255159000
2,8,7,1,1289238901000
3,8,7,6,1289238901000
4,8,7,7,1289238901000


In [3]:
# Raw tagging records filled by load_data(): {user_id: {item_id: [tag_id, ...]}}
records = {}
# Train / test splits filled by train_test_split(); same nested shape as `records`.
train_data = {}
test_data = {}
# Count tables filled by initStat() from the training split.
# Each one is a nested dict {outer_key: {inner_key: count}}.
user_tags = {}   # user_tags[user][tag]  -> number of (item, tag) records of that user with this tag
tag_items = {}   # tag_items[tag][item]  -> number of records attaching this tag to the item
user_items = {}  # user_items[user][item] -> number of tags the user put on the item
# Additional (reverse-direction) tables.
items_tags = {}  # items_tags[item][tag]  -> number of records attaching this tag to the item
tag_users = {}   # tag_users[tag][user]  -> number of records in which the user applied the tag
items_users = {} # items_users[item][user] -> number of tags the user put on the item

In [4]:
def load_data(file_name):
    """Read a tab-separated tagging file into the module-level ``records`` dict.

    Every row adds one tag to ``records[userID][bookmarkID]``, giving the shape
    ``{user_id: {item_id: [tag_id, ...]}}``. Entries are appended to whatever
    ``records`` already holds; nothing is cleared first.

    Args:
        file_name: Path or file-like object passed to ``pd.read_csv``. The data
            must contain ``userID``, ``bookmarkID`` and ``tagID`` columns.
    """
    print("Begin load data.")
    # Read the file the caller asked for rather than a module-level global.
    data = pd.read_csv(file_name, sep='\t')
    # Walk the three columns in lockstep instead of doing a label lookup
    # (data[col][i]) for every cell of every row.
    rows = zip(data['userID'], data['bookmarkID'], data['tagID'])
    for uid, iid, tag in tqdm(rows, total=len(data)):
        records.setdefault(uid, {}).setdefault(iid, []).append(tag)
    print("dataset size: %d." %(len(data)))
    print("number of peolple set tag: %d" % (len(records)))
    print("load data complete.")
# Fill the module-level `records` dict from the tagging file.
load_data(data_file)

Begin load data.


100%|██████████| 437593/437593 [00:22<00:00, 19095.90it/s]

dataset size: 437593.
number of peolple set tag: 1867
load data complete.





In [5]:
def train_test_split(ratio = 0.2, seed = 123):
    """Randomly split ``records`` into the global ``train_data`` / ``test_data``.

    The split is per (user, item) pair: each bookmark of a user goes, with all
    of its tags, to the test split with probability ``ratio`` and to the
    training split otherwise.

    Args:
        ratio: Probability that a (user, item) pair lands in the test split.
        seed: Seed for the ``random`` module, so the split is reproducible.
    """
    random.seed(seed)
    # Start from empty splits. Without this, a second call (same or different
    # ratio/seed) would append duplicate tags or leave a bookmark in both
    # splits, leaking test data into training.
    train_data.clear()
    test_data.clear()
    for u, items in tqdm(records.items()):
        for i, tags in items.items():
            # Exactly one random draw per (user, item), in iteration order.
            target = test_data if random.random() < ratio else train_data
            # Store a copy so later edits to a split never touch `records`.
            target.setdefault(u, {})[i] = list(tags)

    # Note: these are counts of users present in each split, not of samples.
    print("train sample: %d, test sample: %d" % (len(train_data), len(test_data)))
# Each (user, item) pair goes to the test split with probability 0.2 (default seed 123).
train_test_split(0.2)

100%|██████████| 1867/1867 [00:00<00:00, 5529.92it/s]

train sample: 1859, test sample: 1798





In [6]:
def addValueToMat(mat, index, item, value=1):
    """Add ``value`` to the nested-dict cell ``mat[index][item]``.

    The row ``mat[index]`` and the cell are created on first use; an existing
    cell is incremented in place. ``mat`` is modified and nothing is returned.
    """
    row = mat.setdefault(index, {})
    if item in row:
        row[item] += value
    else:
        row[item] = value

        

In [8]:
def initStat():
    """Rebuild the six global count tables from ``train_data``.

    For every (user, item, tag) record in the training split, the matching
    cell of ``user_tags``, ``tag_items``, ``user_items``, ``items_tags``,
    ``tag_users`` and ``items_users`` is incremented by one.
    """
    # Rebuild from scratch. Otherwise a re-run would double every count, or
    # mix in statistics from an earlier train/test split.
    for table in (user_tags, tag_items, user_items,
                  items_tags, tag_users, items_users):
        table.clear()
    for u, items in tqdm(train_data.items()):
        for i, tags in items.items():
            for tag in tags:
                # relation between user and tag
                addValueToMat(user_tags, u, tag, 1)
                # relation between tag and item
                addValueToMat(tag_items, tag, i, 1)
                # relation between user and item
                addValueToMat(user_items, u, i, 1)
                # relation between item and tag
                addValueToMat(items_tags, i, tag, 1)
                # relation between tag and user
                addValueToMat(tag_users, tag, u, 1)
                # relation between item and user
                addValueToMat(items_users, i, u, 1)
    print("user_tags, tag_items, user_items initialize.")
    print("user_tags: %d, tag_items: %d, user_items: %d" % (len(user_tags), len(tag_items), len(user_items)))
# Build the user/tag/item count tables from the training split.
initStat()

100%|██████████| 1859/1859 [00:01<00:00, 936.33it/s] 

user_tags, tag_items, user_items initialize.
user_tags: 1859, tag_items: 36811, user_items: 1859





In [9]:
# Recommend the top-N items for a user from tag-based scores.
def recommend(user, N, method="tfidf"):
    """Rank items the user has not tagged yet and return the best ``N``.

    An item's score is the sum, over the tags the user has applied, of
    ``wut * wti / norm``. ``wut`` is the user's count for the tag, ``wti`` is
    the tag's count on the item, and ``norm`` depends on ``method``:

    * ``"tfidf"`` (default): ``log(1 + number of users who used the tag)``,
      which damps very popular tags.
    * ``"norm"``: ``(number of users who used the tag) * (number of distinct
      tags the user applied)``.
    * ``"simple"``: no normalisation (``norm = 1``).

    Args:
        user: User id to recommend for.
        N: Maximum number of items to return.
        method: One of ``"tfidf"``, ``"norm"``, ``"simple"``.

    Returns:
        A list of at most ``N`` ``(item, score)`` tuples, highest score first.
        A user with no training statistics gets an empty list.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ("tfidf", "norm", "simple"):
        raise ValueError("unknown method: %r" % (method,))
    # .get() so a user absent from the training statistics yields no
    # recommendations instead of a KeyError.
    tagged_items = user_items.get(user, {})
    own_tags = user_tags.get(user, {})
    recommend_items = {}
    for tag, wut in own_tags.items():
        # The normaliser depends only on the tag (and the user), so it is
        # computed once per tag rather than once per candidate item.
        n_tag_users = len(tag_users[tag])
        if method == "tfidf":
            norm = math.log(n_tag_users + 1)
        elif method == "norm":
            norm = n_tag_users * len(own_tags)
        else:
            norm = 1
        for item, wti in tag_items[tag].items():
            # Never recommend something the user already bookmarked.
            if item in tagged_items:
                continue
            recommend_items[item] = recommend_items.get(item, 0) + wut * wti / norm
    return sorted(recommend_items.items(), key = operator.itemgetter(1), reverse = True)[0:N]


In [10]:
# Compute precision and recall of the top-N recommendations on the test split.
def precisionAndRecall(N):
    """Evaluate ``recommend(user, N)`` against ``test_data``.

    Only users that also appear in ``train_data`` are evaluated. A hit is a
    recommended item that the user bookmarked in the test split.

    Args:
        N: Length of the recommendation list requested per user.

    Returns:
        ``(precision, recall)`` as fractions in ``[0, 1]``. Precision is
        hits / (N * evaluated users); recall is hits / total test items of the
        evaluated users. Either value is ``0.0`` when its denominator is zero
        (no evaluated users, or ``N == 0``).
    """
    hit = 0
    h_recall = 0
    h_precision = 0
    for user, items in tqdm(test_data.items()):
        if user not in train_data:
            continue
        rank = recommend(user, N)
        hit += sum(1 for item, _ in rank if item in items)
        h_recall += len(items)
        h_precision += N

    # Guard the divisions: an empty evaluation set used to raise ZeroDivisionError.
    precision = hit / (h_precision * 1.0) if h_precision else 0.0
    recall = hit / (h_recall * 1.0) if h_recall else 0.0
    return precision, recall

In [11]:

def testRecommend():
    print("asscess recommend result")
    print("%3s %10s %10s" % ('N',"precision",'recall'))
    for n in [5,10,20,40,60,80,100]:
        precision,recall = precisionAndRecall(n)
        print("%3d %10.3f%% %10.3f%%" % (n, precision * 100, recall * 100))

In [12]:
# Run the evaluation sweep; prints one precision/recall row per list size N.
testRecommend()

  0%|          | 0/1798 [00:00<?, ?it/s]

asscess recommend result
  N  precision     recall


100%|██████████| 1798/1798 [01:14<00:00, 24.18it/s]
  0%|          | 4/1798 [00:00<01:03, 28.18it/s]

  5      0.894%      0.381%


100%|██████████| 1798/1798 [01:10<00:00, 25.52it/s]
  0%|          | 4/1798 [00:00<01:14, 24.11it/s]

 10      0.721%      0.614%


100%|██████████| 1798/1798 [01:12<00:00, 24.71it/s]
  0%|          | 4/1798 [00:00<00:52, 34.18it/s]

 20      0.570%      0.970%


100%|██████████| 1798/1798 [01:14<00:00, 24.12it/s]
  0%|          | 4/1798 [00:00<00:59, 29.91it/s]

 40      0.433%      1.475%


100%|██████████| 1798/1798 [01:10<00:00, 25.55it/s]
  0%|          | 4/1798 [00:00<01:04, 27.99it/s]

 60      0.346%      1.769%


100%|██████████| 1798/1798 [01:10<00:00, 25.55it/s]
  0%|          | 4/1798 [00:00<00:48, 36.92it/s]

 80      0.298%      2.031%


100%|██████████| 1798/1798 [01:14<00:00, 24.20it/s]

100      0.268%      2.283%



