# User-Based CF

In [1]:
import numpy as np
import pandas as pd
from math import sqrt

In [9]:
class Recommender:
    """User-based collaborative-filtering recommender.

    Parameters
    ----------
    data : dict
        Mapping ``user id -> {item id -> rating}`` (``users_rating`` in this
        notebook). Any ``dict`` subclass is accepted.
    k : int
        Number of most-similar users (neighbours) to draw recommendations from.
    sim_func : str
        Name of the similarity measure. Only ``'pearson'`` is implemented.
    n : int
        Maximum number of items to recommend.

    Raises
    ------
    ValueError
        If ``sim_func`` names a similarity measure that is not implemented.
    TypeError
        If ``data`` is not a dict.
    """

    def __init__(self, data, k = 3, sim_func='pearson', n=12):
        # Fail at construction time instead of leaving self.fn / self.data
        # undefined and crashing later with an unrelated AttributeError.
        if sim_func != 'pearson':
            raise ValueError(
                "unsupported sim_func %r; only 'pearson' is implemented" % (sim_func,)
            )
        if not isinstance(data, dict):
            raise TypeError(
                "data must be a dict of {user: {item: rating}}, got %s"
                % type(data).__name__
            )

        self.k = k
        self.n = n
        self.sim_func = sim_func
        self.fn = self.pearson_sim
        self.data = data

    def pearson_sim(self, rating1, rating2):
        """Return the Pearson correlation of two users over their co-rated items.

        ``rating1`` and ``rating2`` are ``{item id -> rating}`` dicts. Returns 0
        when the users share no items or when either user's shared ratings have
        no variance (the correlation is undefined there).
        """
        sum_x = 0
        sum_y = 0
        sum_xy = 0
        sum_x2 = 0
        sum_y2 = 0
        n = 0
        for key, x in rating1.items():
            if key in rating2:
                y = rating2[key]
                n += 1
                sum_x += x
                sum_y += y
                sum_xy += x * y
                sum_x2 += pow(x, 2)
                sum_y2 += pow(y, 2)
        if n == 0:
            return 0

        var_x = n * sum_x2 - pow(sum_x, 2)
        var_y = n * sum_y2 - pow(sum_y, 2)
        # Floating-point rounding can push a zero variance slightly below zero,
        # which would make sqrt() raise; treat that the same as zero variance.
        if var_x <= 0 or var_y <= 0:
            return 0

        denominator = sqrt(var_x) * sqrt(var_y)
        if denominator == 0:
            return 0
        return (n * sum_xy - sum_x * sum_y) / denominator

    def user_sim_sort(self, user_id):
        """Return ``[(other user id, similarity), ...]`` sorted most-similar first."""
        target_ratings = self.data[user_id]
        similarities = [
            (other_id, self.fn(target_ratings, self.data[other_id]))
            for other_id in self.data
            if other_id != user_id
        ]
        # Higher similarity means the two users rate more alike.
        similarities.sort(key=lambda pair: pair[1], reverse=True)
        return similarities

    def recommand(self, user_id):
        """Recommend items that ``user_id`` has not rated yet.

        Returns a tuple ``(recommendations, user_sim)``: ``recommendations`` is
        a list of at most ``n`` ``(item id, score)`` pairs sorted by descending
        score, and ``user_sim`` is the full sorted output of ``user_sim_sort``.

        Raises ``KeyError`` if ``user_id`` is not present in the data.
        """
        # Similarity between the user and every other user, best first.
        user_sim = self.user_sim_sort(user_id)

        # There may be fewer than k other users; never index past the list.
        neighbor_count = max(0, min(self.k, len(user_sim)))
        neighbors = user_sim[:neighbor_count]

        # Total similarity of the chosen neighbours, used to normalise weights.
        total_sim = 0.0
        for _, similarity in neighbors:
            total_sim += similarity
        if total_sim == 0.0:
            total_sim = 1.0  # avoid dividing by zero; weights then stay as-is

        user_rating = self.data[user_id]
        recommendations = {}
        for neighbor_id, similarity in neighbors:
            # Share of the total similarity contributed by this neighbour.
            weight = similarity / total_sim
            neighbor_ratings = self.data[neighbor_id]

            # Accumulate weighted ratings for items the user has not rated.
            for item_id in neighbor_ratings:
                if item_id not in user_rating:
                    if item_id not in recommendations:
                        recommendations[item_id] = neighbor_ratings[item_id] * weight
                    else:
                        recommendations[item_id] = recommendations[item_id] + neighbor_ratings[item_id] * weight

        ranked = list(recommendations.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        return ranked[:self.n], user_sim

    # Correctly spelled alias; `recommand` is kept for existing callers.
    recommend = recommand

# if __name__ == "__main__":

#     # 获取数据
#     users_rating = dict()
#     data_path = "./ratings.csv"
#     with open(data_path, 'r') as file:
#         for line in file:
#             items = line.strip().split(',')
#             if items[0] not in users_rating:
#                 users_rating[items[0]] = dict()
#             users_rating[items[0]][items[1]] = dict()
#             users_rating[items[0]][items[1]] = float(items[2])

In [11]:
# Load the ratings CSV into a nested dict: user id -> {item id -> rating}.
data_path = "./ratings.csv"
users_rating = {}
with open(data_path, 'r') as file:
    next(file)  # skip the header row
    for line in file:
        items = line.strip().split(',')
        # Columns used: 0 = user id, 1 = item id, 2 = rating.
        ratings_of_user = users_rating.setdefault(items[0], {})
        ratings_of_user[items[1]] = float(items[2])

In [12]:
# Number of distinct user ids loaded into users_rating.
print(len(users_rating))

138493


In [21]:
# Recommend for user '1' using the Recommender defaults (k=3 neighbours, n=12 items).
user_id = '1'
recomm = Recommender(users_rating)
# recommand() returns the top-n (item id, score) list and the full sorted similarity list.
recommendations, user_sim = recomm.recommand(user_id)
print("\nmovie id list:", recommendations)
# Print only the 15 most similar users; user_sim has one entry per other user.
print("\nnear list:", user_sim[:15])
print()
print(len(user_sim))


movie id list: [('805', 3.0), ('34', 2.333333333333333), ('39', 1.6666666666666665), ('1213', 1.6666666666666665), ('1885', 1.6666666666666665), ('3751', 1.6666666666666665), ('36', 1.6666666666666665), ('62', 1.6666666666666665), ('86', 1.6666666666666665), ('279', 1.3333333333333333), ('314', 1.3333333333333333), ('356', 1.3333333333333333)]

near list: [('179', 1.0000000000000002), ('1132', 1.0000000000000002), ('1329', 1.0000000000000002), ('1473', 1.0000000000000002), ('1607', 1.0000000000000002), ('1660', 1.0000000000000002), ('2288', 1.0000000000000002), ('2913', 1.0000000000000002), ('2944', 1.0000000000000002), ('3298', 1.0000000000000002), ('4981', 1.0000000000000002), ('5184', 1.0000000000000002), ('6310', 1.0000000000000002), ('7711', 1.0000000000000002), ('7739', 1.0000000000000002)]

138492


In [22]:
# users_rating['138493']

Dictionary users_rating looks like:
```
{'1': {'2': 3.5, '29': 3.5, '32': 3.5, '47': 3.5, ...},
 '2': {'3': 4.0, '62': 5.0, '70': 5.0, '110': 4.0, ...},
 '3': {'1': 4.0, '24': 3.0, '32': 4.0, '50': 5.0, ...},
 ...
 '138493': {'1': 3.5, '2': 4.0, '18': 4.5, '19': 4.0, ...}}
 ```

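1. Define k and n (defaults: k=3, n=12). The similarity function is the Pearson correlation coefficient (i.e. normalized covariance); it ranges over [-1, 1], and a larger value means the two users' ratings are more alike.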
2. Choose a user by id, such as '1', and compute the similarity between user '1' and each of the other 138492 users (138492 similarity values in total).
    - Call the function **user_sim_sort()** and function **pearson_sim()**.
    - Together they return a list of (user id, similarity) pairs for user '1' and the other 138492 users, sorted from highest to lowest similarity.
    
3. The func recommand() is the main function of class Recommender.
    - It picks out the **top k** users who are most similar to the current user '1' based on the sorted list returned from **user_sim_sort()** func. <u>Default k is 3</u>
    - Read the rating lists of these k most similar users.
    - Find the items that appear in those k neighbours' rating lists but not in the list of user '1'; these items, which are new to user '1', are the candidates to recommend. Each candidate's score is the sum of the neighbours' ratings, weighted by each neighbour's share of the total similarity.
    - Sort and Select top n best items to recommend to user '1'. <u>Default n is 12</u>

# Item-Based CF

In [1]:
from math import sqrt

In [2]:
class ItemBasedCF:
    """Item-based collaborative filtering over a ratings CSV.

    ``train_file`` is the path to a CSV whose first line is a header and whose
    remaining lines each have exactly four comma-separated fields:
    user id, item id, score, and a fourth field that is ignored.
    The file is read once, at construction time.
    """

    def __init__(self, train_file):
        self.train_file = train_file
        self.W = None  # item-item similarity matrix, filled in by item_sim()
        self.read_data()

    def read_data(self):
        """Read ``train_file`` into ``self.train`` as ``{user: {item: score}}``.

        Raises ``ValueError`` if a non-blank data line does not have exactly
        four fields or its score is not numeric.
        """
        self.train = dict()
        # The context manager guarantees the file is closed, even on a bad row.
        with open(self.train_file) as ratings_file:
            next(ratings_file, None)  # skip the header row (no-op if empty)
            for line in ratings_file:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines, e.g. at end of file
                user_id, item_id, score, _ = line.split(',')
                # Keep the score as a float: truncating to int would turn
                # half-step ratings such as 3.5 into 3 and 0.5 into 0.
                self.train.setdefault(user_id, {})[item_id] = float(score)

    def item_sim(self):
        """Build, store and return the item-item similarity matrix ``self.W``.

        ``W[i][j] = C[i][j] / sqrt(N[i] * N[j])`` where ``C[i][j]`` is the
        number of users who rated both items and ``N[i]`` is the number of
        users who rated item ``i``.
        """
        C = dict()  # item-item co-occurrence counts
        N = dict()  # number of users who rated each item
        for user, items in self.train.items():
            for i in items.keys():
                N.setdefault(i, 0)
                N[i] += 1
                C.setdefault(i, {})
                for j in items.keys():
                    if i == j:
                        continue
                    C[i].setdefault(j, 0)
                    C[i][j] += 1

        # Normalise the co-occurrence counts into similarities.
        self.W = dict()
        for i, related_items in C.items():
            self.W.setdefault(i, {})
            for j, cij in related_items.items():
                self.W[i][j] = cij / (sqrt(N[i] * N[j]))
        return self.W

    def recommend(self,user,K=3,N=10):
        """Recommend up to ``N`` items for ``user``.

        For every item the user rated, the ``K`` most similar items are looked
        up; each one the user has not rated accumulates
        ``score * similarity``. Returns a dict ``{item: accumulated score}``
        ordered by descending score.

        Raises ``KeyError`` if ``user`` is not in the training data.
        """
        # Build the similarity matrix on first use so that calling
        # recommend() before item_sim() does not fail.
        if getattr(self, 'W', None) is None:
            self.item_sim()

        rank = dict()
        action_item = self.train[user]  # items the user rated, with scores
        for item, score in action_item.items():
            top_similar = sorted(self.W[item].items(), key=lambda x: x[1], reverse=True)[0:K]
            for j, wj in top_similar:
                if j in action_item:
                    continue
                rank.setdefault(j, 0)
                rank[j] += score * wj
        return dict(sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:N])

# if __name__ == "__main__":

#     CF = ItemBasedCF('./ratings.csv')
#     CF.item_sim()
#     recomm_dic = CF.recommend('1')

#     for k,v in recomm_dic.iteritems():
#         print(k,"\t",v)

In [1]:
# Read Data
# Build the user -> {item -> score} table from the ratings CSV.
train = dict()
data_path = "./ratings.csv"
# The context manager closes the file even if a malformed row raises mid-loop.
with open(data_path) as ratings_file:
    next(ratings_file, None)  # skip the header row (no-op for an empty file)
    for line in ratings_file:
        # Each data row has four comma-separated fields; the last is unused.
        user_id, item_id, score, _ = line.strip().split(',')
        train.setdefault(user_id, {})
        train[user_id][item_id] = float(score)