In [1]:
import random
import math
import time 
from tqdm.autonotebook import tqdm 

# 定义通用的函数

In [2]:
# 定义监控运行时间的装饰器
def timmer(func):
    """Decorator that reports how long each call to ``func`` takes.

    After the wrapped call returns, one line of the form
    ``Func: <name> | Run time: <seconds>`` is printed. The wrapped
    function's return value is passed through unchanged. If ``func`` raises,
    the exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and metadata as ``func``.
    """
    # Local import: the notebook's import cell does not bring in functools.
    import functools

    @functools.wraps(func)  # keep __name__/__doc__ of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print("Func: {:s} | Run time: {:.2f}".format(func.__name__, elapsed))
        return res

    return wrapper

# 定义数据集相关的函数

In [5]:
# Peek at the first nine lines of the raw file.
with open("data/Slashdot0902.txt", "r") as edge_file:
    remaining = 9
    for raw_line in edge_file:
        # raw_line keeps its trailing newline, so print adds a blank line after it.
        print(raw_line)
        remaining -= 1
        if remaining == 0:
            break

# Directed graph (each unordered pair of nodes is saved once): Slashdot0902.txt 

# Slashdot Zoo social network from February 0 2009

# Nodes: 82168 Edges: 948464

# FromNodeId	ToNodeId

0	0

0	1

0	2

0	3

0	4



In [17]:
class Dataset:
    """Edge list loaded from a tab-separated text file.

    Every data row is split on tabs and later unpacked as a pair ``(u, v)``,
    read as "user u points to user v". Node ids are kept as strings.
    """

    def __init__(self, fp, sample=100000):
        """Load the edges stored in ``fp``.

        Args:
            fp: Path of the edge-list file.
            sample: Number of randomly chosen edges to keep; ``-1`` keeps all.
        """
        self.data = self.loadData(fp, sample)

    def loadData(self, fp, sample):
        """Read the file and return its edges as a list of field lists.

        The first four lines are dropped as a header. Blank lines are skipped
        so that a trailing empty line cannot produce a malformed row that
        would later break the ``(u, v)`` unpacking in ``splitData``.

        Args:
            fp: Path of the edge-list file.
            sample: ``-1`` to return every edge in file order, otherwise the
                edges are shuffled in place with the global ``random`` state
                and the first ``sample`` of them are returned.

        Returns:
            A list of ``[u, v]`` string lists.
        """
        # Context manager guarantees the file handle is closed.
        with open(fp) as edge_file:
            body = edge_file.readlines()[4:]
        data = [line.strip().split("\t") for line in body if line.strip()]
        if sample == -1:
            return data
        random.shuffle(data)
        return data[:sample]

    def splitData(self, M, k, seed=2019):
        """Randomly split the edges into a training and a test part.

        The global ``random`` generator is re-seeded with ``seed``; each edge
        then draws an integer in ``[0, M-1]`` and goes to the test set when
        the draw equals ``k``, otherwise to the training set.

        Args:
            M: Number of buckets used for the random draw.
            k: Bucket whose edges form the test set.
            seed: Seed for the random draw.

        Returns:
            ``((G, GT), test_G)`` where ``G`` maps each training user to the
            list of users they point to, ``GT`` maps each training user to the
            list of users pointing to them, and ``test_G`` is the forward
            mapping of the test edges.
        """
        train, test = [], []
        random.seed(seed)
        for u, v in self.data:
            if random.randint(0, M - 1) == k:
                test.append((u, v))
            else:
                train.append((u, v))

        # Convert an edge list into forward / reverse adjacency dictionaries.
        def conver_dict(data):
            data_dict = {}    # users the current user points to
            data_dict_t = {}  # users pointing to the current user
            for u, v in data:
                data_dict.setdefault(u, set()).add(v)
                data_dict_t.setdefault(v, set()).add(u)

            # Sets deduplicate repeated edges; expose them as lists.
            data_dict = {node: list(nbrs) for node, nbrs in data_dict.items()}
            data_dict_t = {node: list(nbrs) for node, nbrs in data_dict_t.items()}
            return data_dict, data_dict_t

        return conver_dict(train), conver_dict(test)[0]

# 评价指标

In [9]:
class Metric:
    """Precision / recall of a recommender evaluated on held-out test data.

    Args:
        train: Training data; stored on the instance but not read by the
            metric computations in this class.
        test: Mapping from user to the collection of that user's held-out items.
        GetRecommendation: Callable ``user -> [(item, score), ...]``.
    """

    def __init__(self, train, test, GetRecommendation):
        self.train = train
        self.test = test
        self.GetRecommendation = GetRecommendation
        # Recommendations are computed once up front and reused by every metric.
        self.recs = self.getRec()

    def getRec(self):
        """Return ``{user: recommendation list}`` for every user in the test set."""
        return {user: self.GetRecommendation(user) for user in self.test}

    def precision(self):
        """Percentage of recommended items that are in the test set, rounded to 2 places."""
        hits = 0
        recommended = 0
        for user, held_out in self.test.items():
            relevant = set(held_out)
            ranked = self.recs[user]
            hits += sum(1 for item, _score in ranked if item in relevant)
            recommended += len(ranked)
        if recommended > 0:
            return round(hits / recommended * 100, 2)
        return 0

    def recall(self):
        """Percentage of test-set items that were recommended, rounded to 2 places."""
        hits = 0
        relevant_total = 0
        for user, held_out in self.test.items():
            relevant = set(held_out)
            ranked = self.recs[user]
            hits += sum(1 for item, _score in ranked if item in relevant)
            relevant_total += len(relevant)
        if relevant_total > 0:
            return round(hits / relevant_total * 100, 2)
        return 0

    def eval(self):
        """Compute both metrics, print them, and return them as a dict."""
        metric = {"Precision": self.precision(), "Recall": self.recall()}
        print("Metric: ", metric)
        return metric

# 算法的实现

- OUT
- IN
- OUT_IN
- OUT_IN_Cosine

In [21]:
# 1. 利用用户出度计算相似度
def OUT(train, N):
    """Build a recommender that scores candidates by shared out-links.

    A candidate ``v`` is scored as
    ``common / sqrt(len(G[user]) * len(G[v]))`` where ``common`` counts the
    users that both ``user`` and ``v`` point to.

    Args:
        train: Pair ``(G, GT)`` of forward and reverse adjacency dicts.
        N: Maximum number of recommendations to return.

    Returns:
        A function ``user -> [(candidate, score), ...]`` sorted by descending
        score and truncated to ``N`` entries.
    """
    G, GT = train

    def GetRecommendation(user):
        if user not in G:
            return []
        followees = G[user]
        already_followed = set(followees)
        common = {}
        # For each user the target points to, look at everyone else pointing there.
        for followee in followees:
            for candidate in GT.get(followee, ()):
                if candidate == user or candidate in already_followed:
                    continue
                common[candidate] = common.get(candidate, 0) + 1

        scored = [
            (candidate, count / math.sqrt(len(G[user]) * len(G[candidate])))
            for candidate, count in common.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:N]

    return GetRecommendation

In [12]:
# 2. 利用用户的入度计算相似度
def IN(train, N):
    """Build a recommender that scores candidates by shared in-links.

    A candidate ``v`` is scored as
    ``common / sqrt(len(GT[user]) * len(GT[v]))`` where ``common`` counts the
    users that point to both ``user`` and ``v``.

    Args:
        train: Pair ``(G, GT)`` of forward and reverse adjacency dicts.
        N: Maximum number of recommendations to return.

    Returns:
        A function ``user -> [(candidate, score), ...]`` sorted by descending
        score and truncated to ``N`` entries.
    """
    G, GT = train

    def GetRecommendation(user):
        if user not in GT:
            return []
        followers = GT[user]
        # Users the target already points to are never recommended.
        already_followed = set(G[user]) if user in G else set()
        common = {}
        # For each user pointing to the target, look at who else they point to.
        for follower in followers:
            for candidate in G.get(follower, ()):
                if candidate == user or candidate in already_followed:
                    continue
                common[candidate] = common.get(candidate, 0) + 1

        scored = [
            (candidate, count / math.sqrt(len(GT[user]) * len(GT[candidate])))
            for candidate, count in common.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:N]

    return GetRecommendation

In [13]:
# 3. 利用用户的出度和入度进行计算，不考虑热门入度用户的惩罚
def OUT_IN(train, N):
    """Build a recommender based on two-step out-links, without popularity penalty.

    A candidate ``v`` is scored as ``common / len(G[user])`` where ``common``
    counts the users ``u`` such that ``user`` points to ``u`` and ``u`` points
    to ``v``.

    Args:
        train: Pair ``(G, GT)`` of forward and reverse adjacency dicts; only
            ``G`` is read here.
        N: Maximum number of recommendations to return.

    Returns:
        A function ``user -> [(candidate, score), ...]`` sorted by descending
        score and truncated to ``N`` entries.
    """
    G, GT = train

    def GetRecommendation(user):
        if user not in G:
            return []
        followees = G[user]
        already_followed = set(followees)
        common = {}
        # Walk two hops: user -> followee -> candidate.
        for followee in followees:
            for candidate in G.get(followee, ()):
                if candidate == user or candidate in already_followed:
                    continue
                common[candidate] = common.get(candidate, 0) + 1

        scored = [
            (candidate, count / len(G[user]))
            for candidate, count in common.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:N]

    return GetRecommendation

In [14]:
# 4. 利用用户的出度和入度的余弦相似度计算
def OUT_IN_Cosine(train, N):
    """Build a recommender based on two-step out-links with cosine normalisation.

    A candidate ``v`` is scored as
    ``common / sqrt(len(G[user]) * len(GT[v]))`` where ``common`` counts the
    users ``u`` such that ``user`` points to ``u`` and ``u`` points to ``v``.
    Dividing by the candidate's in-degree penalises very popular candidates.

    Args:
        train: Pair ``(G, GT)`` of forward and reverse adjacency dicts.
        N: Maximum number of recommendations to return.

    Returns:
        A function ``user -> [(candidate, score), ...]`` sorted by descending
        score and truncated to ``N`` entries.
    """
    G, GT = train

    def GetRecommendation(user):
        if user not in G:
            return []
        user_sim = {}
        user_friends = set(G[user])
        # Walk two hops: user -> u -> v.
        for u in G[user]:
            if u not in G:
                continue
            for v in G[u]:
                if v != user and v not in user_friends:
                    user_sim[v] = user_sim.get(v, 0) + 1
        user_sim = {
            v: count / math.sqrt(len(G[user]) * len(GT[v]))
            for v, count in user_sim.items()
        }
        ranked = sorted(user_sim.items(), key=lambda x: x[1], reverse=True)
        # Honour N like the other algorithms; returning the full list would
        # make precision/recall incomparable across algorithms.
        return ranked[:N]

    return GetRecommendation

# 实验

In [19]:
class Experiment:
    """Run one recommendation algorithm over M random train/test splits.

    Args:
        M: Number of splits to evaluate; also the number of buckets passed to
            ``Dataset.splitData``.
        N: Number of recommendations requested per user.
        fp: Path of the edge-list file.
        rt: Name of the algorithm to use; one of the keys of ``self.alg``.
    """

    def __init__(self, M, N, fp="data/Slashdot0902.txt", rt="OUT"):
        self.M = M
        self.N = N
        self.fp = fp
        self.rt = rt
        # Lookup table from algorithm name to its factory function.
        self.alg = {"OUT": OUT, "IN": IN,
                    "OUT_IN": OUT_IN, "OUT_IN_Cosine": OUT_IN_Cosine}

    # Run a single experiment (one train/test split).
    def worker(self, train, test):
        """Evaluate the selected algorithm on one split.

        Args:
            train: Pair ``(G, GT)`` as returned by ``Dataset.splitData``.
            test: Forward adjacency dict of the held-out edges.

        Returns:
            The dict produced by ``Metric.eval``.
        """
        getRecommendation = self.alg[self.rt](train, self.N)
        # Metric only receives the forward training graph.
        metric = Metric(train[0], test, getRecommendation)
        return metric.eval()

    @timmer
    def run(self):
        """Evaluate all M splits, print the averaged metrics, and return them.

        Returns:
            Dict with the "Precision" and "Recall" values averaged over the
            M splits.
        """
        metrics = {"Precision": 0, "Recall": 0}
        dataset = Dataset(self.fp)
        for ii in range(self.M):
            # Split ii uses bucket ii as the test set.
            train, test = dataset.splitData(self.M, ii)
            print(f"Experiment: {ii}")
            metric = self.worker(train, test)
            metrics = {k: metrics[k] + metric[k] for k in metrics}

        metrics = {k: metrics[k] / self.M for k in metrics}
        print("Average Result (M={}, N={}, alg={}): {}".format(self.M, self.N, self.rt, metrics))
        # Hand the averages back so callers can collect or compare results
        # instead of having to parse the printed output.
        return metrics

In [22]:
# Slashdot dataset: evaluate every algorithm with M = 10 splits and N = 10 recommendations.
M = 10
N = 10
for alg in ("OUT", "IN", "OUT_IN", "OUT_IN_Cosine"):
    print(f"当前使用的算法是: {alg}")
    exp = Experiment(M=M, N=N, rt=alg)
    exp.run()

当前使用的算法是: OUT
Experiment: 0
Metric:  {'Precision': 0.08, 'Recall': 0.35}
Experiment: 1
Metric:  {'Precision': 0.07, 'Recall': 0.32}
Experiment: 2
Metric:  {'Precision': 0.07, 'Recall': 0.3}
Experiment: 3
Metric:  {'Precision': 0.08, 'Recall': 0.35}
Experiment: 4
Metric:  {'Precision': 0.07, 'Recall': 0.31}
Experiment: 5
Metric:  {'Precision': 0.07, 'Recall': 0.3}
Experiment: 6
Metric:  {'Precision': 0.1, 'Recall': 0.45}
Experiment: 7
Metric:  {'Precision': 0.06, 'Recall': 0.27}
Experiment: 8
Metric:  {'Precision': 0.11, 'Recall': 0.46}
Experiment: 9
Metric:  {'Precision': 0.09, 'Recall': 0.41}
Average Result (M=10, N=10, alg=OUT): {'Precision': 0.08, 'Recall': 0.352}
Func: run | Run time: 9.94
当前使用的算法是: IN
Experiment: 0
Metric:  {'Precision': 0.07, 'Recall': 0.3}
Experiment: 1
Metric:  {'Precision': 0.08, 'Recall': 0.35}
Experiment: 2
Metric:  {'Precision': 0.08, 'Recall': 0.34}
Experiment: 3
Metric:  {'Precision': 0.07, 'Recall': 0.29}
Experiment: 4
Metric:  {'Precision': 0.07, 'Recal

In [23]:
# 2. Epinions dataset: evaluate every algorithm with M = 10 splits and N = 10 recommendations.
M = 10
N = 10
for alg in ("OUT", "IN", "OUT_IN", "OUT_IN_Cosine"):
    print(f"当前使用的算法是: {alg}")
    exp = Experiment(M=M, N=N, fp="data/soc-Epinions1.txt", rt=alg)
    exp.run()

当前使用的算法是: OUT
Experiment: 0
Metric:  {'Precision': 0.16, 'Recall': 0.68}
Experiment: 1
Metric:  {'Precision': 0.21, 'Recall': 0.92}
Experiment: 2
Metric:  {'Precision': 0.18, 'Recall': 0.79}
Experiment: 3
Metric:  {'Precision': 0.2, 'Recall': 0.86}
Experiment: 4
Metric:  {'Precision': 0.2, 'Recall': 0.84}
Experiment: 5
Metric:  {'Precision': 0.18, 'Recall': 0.78}
Experiment: 6
Metric:  {'Precision': 0.19, 'Recall': 0.83}
Experiment: 7
Metric:  {'Precision': 0.19, 'Recall': 0.79}
Experiment: 8
Metric:  {'Precision': 0.19, 'Recall': 0.81}
Experiment: 9
Metric:  {'Precision': 0.19, 'Recall': 0.82}
Average Result (M=10, N=10, alg=OUT): {'Precision': 0.18899999999999997, 'Recall': 0.812}
Func: run | Run time: 15.14
当前使用的算法是: IN
Experiment: 0
Metric:  {'Precision': 0.24, 'Recall': 0.91}
Experiment: 1
Metric:  {'Precision': 0.23, 'Recall': 0.87}
Experiment: 2
Metric:  {'Precision': 0.24, 'Recall': 0.94}
Experiment: 3
Metric:  {'Precision': 0.25, 'Recall': 0.95}
Experiment: 4
Metric:  {'Precis