In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# mixed recommendation: user based collaborative filtering and content based recommendation
class UBFCBR:
    def __init__(self, filepath, k=10, n=10):
        self.filepath = filepath
        self.k = k
        self.n = n

        self.train, self.test = self.load_data()

        self.train_data, self.test_data = self.split_data()
        self.user_sim_matrix = self.user_similarity()
        self.cbr_user_item_sim = self.cbr_user_item_sim()

        self.recommendation = self.recommend()


    # load data
    def load_data(self):
        print("加载数据...")
        rating = pd.read_csv(self.filepath)
        train, test = train_test_split(rating, test_size=0.2, random_state=42)
        train_dict = train.iloc[:,:3].to_dict('split')
        train_dict = train_dict['data']


        return train, test

    # split data
    def split_data(self):
        print("分割数据...")
        train = self.train.iloc[:,:3].to_dict('split')
        train  = train['data']

        test = self.test.iloc[:,:3].to_dict('split')
        test  = test['data']

        train_data = {}
        test_data = {}
        # control the random seed
        np.random.seed(1)
        for user, item, rating in train:
            if user not in train_data:
                train_data[user] = {}
            train_data[user][item] = rating
        for user, item, rating in test:
            if user not in test_data:
                test_data[user] = {}
            test_data[user][item] = rating
        return train_data, test_data

    # calculate the similarity between users with cosine similarity using train data
    def user_similarity(self):
        print("计算用户相似度矩阵...")
        # build the inverse table for item_users
        item_users = dict()
        for user, items in self.train_data.items():
            for item in items.keys():
                if item not in item_users:
                    item_users[item] = set()
                item_users[item].add(user)
        # calculate co-rated items between users
        C = dict()
        N = dict()
        for item, users in item_users.items():
            for u in users:
                if u not in N:
                    N[u] = 0
                N[u] += 1
                for v in users:
                    if u == v:
                        continue
                    if u not in C:
                        C[u] = {}
                    if v not in C[u]:
                        C[u][v] = 0
                    C[u][v] += 1
        # calculate finial similarity matrix W
        W = dict()
        for u, related_users in C.items():
            if u not in W:
                W[u] = {}
            for v, cuv in related_users.items():
                W[u][v] = cuv / np.sqrt(N[u] * N[v])
        return W

    def cbr_user_item_sim(self):
        movies = pd.read_csv('ml-latest-small/movies.csv')
        ratings = pd.read_csv('ml-latest-small/ratings.csv')

        # split ratings into train and test
        train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

        # transfer movies genre to 0-1 matrix based on genre
        movies['genres'] = movies['genres'].str.split('|')
        movies = movies.join(movies.genres.str.join('|').str.get_dummies())

        # lookup users watched movie and their genre, then sum up
        user_genre = train_data.groupby('userId').apply(lambda x: x.merge(movies, on='movieId', how='left').iloc[:, 3:].sum())

        df = ratings.merge(movies, on='movieId', how='left')
        for i in range(len(df)):
            df.iloc[i, 6:] = df.iloc[i, 6:] * df.iloc[i, 2]

        df_sum = df.groupby('userId').apply(lambda x: x.iloc[:, 6:].sum())
        user_avg = ratings.groupby('userId')['rating'].mean()

        user_pref = df_sum.copy()
        for i in range(len(user_pref)):
            user_pref.iloc[i, :] = (df_sum.iloc[i, :] - (user_avg[i+1] * user_genre.iloc[i, 3:])) / user_genre.iloc[i, 3:]
        user_pref = user_pref.fillna(0)
        user_pref = user_pref.replace(np.inf, 0)

        # calculate cosine similarity between users and movies based on 1-0 matrix
        user_item_sim = cosine_similarity(user_pref, movies.iloc[:, 3:])

        # transfer train data to dictionary
        train_data = train_data.groupby('userId')['movieId'].apply(list).to_dict()

        # look up user's top 10 similar movies and return in a dictionary
        user_item_sim = pd.DataFrame(user_item_sim, index=user_genre.index, columns=movies['movieId'])

        # normalization of user_item_sim
        user_item_sim = user_item_sim.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x)) * 5), axis=1)

        # transfer user_item_sim to dictionary
        user_item_sim = user_item_sim.to_dict('index')

        return user_item_sim

    # recommend items for each user
    def recommend(self, wht=0.8):
        print("为每个用户推荐物品...")
        rank = dict()
        cbr_recommend = dict()
        recommendation = dict()
        for user in self.train_data.keys():
            rank[user] = dict()
            interacted_items = self.train_data[user]
            for v, wuv in sorted(self.user_sim_matrix[user].items(), key=lambda x: x[1], reverse=True)[0:self.k]:
                for i, rvi in self.train_data[v].items():
                    if i in interacted_items:
                        continue
                    if i not in rank[user]:
                        rank[user][i] = 0
                    rank[user][i] += wuv * rvi
            max_score = max(rank[user].values())
            min_score = min(rank[user].values())
            for i in rank[user].keys():
                rank[user][i] = ((rank[user][i] - min_score) / (max_score - min_score)) * 5

            cbr_recommend[user] = dict()
            for v, j in self.cbr_user_item_sim[user].items():
                if v not in interacted_items:
                    cbr_recommend[user][v] = j

        for u, i in cbr_recommend.items():
            recommendation[u] = dict()
            for v, j in cbr_recommend[u].items():
                try:
                    recommendation[u][v] = rank[u][v] * wht + j * (1 - wht)
                except:
                    recommendation[u][v] = j
        return recommendation

    # calculate the precision, recall, mae
    def evaluate(self, nitems):
        print("评估模型...")
        hit = 0
        mae = 0
        n_recall = 0
        n_precision = 0
        n_mae = 0
        for user in self.train_data.keys():
            test_items = self.test_data.get(user, {})
            rank = self.recommendation[user]
            for item, w in sorted(rank.items(), key=lambda x: x[1], reverse=True)[0:nitems]:
                if item in test_items:
                    hit += 1
                    mae += abs(w - test_items[item])
                    n_mae += 1
            n_recall += len(test_items)
            n_precision += nitems
        return hit / (1.0 * n_precision), hit / (1.0 * n_recall), mae / (1.0 * n_mae)

In [22]:
ubfcbr = UBFCBR('ml-latest-small/ratings.csv')

加载数据...
分割数据...
计算用户相似度矩阵...
为每个用户推荐物品...


In [23]:
ubfcbr.evaluate(10)

评估模型...


(0.03360655737704918, 0.010164617215390718, 0.7121503132430916)

In [None]:
n = [5, 10, 15, 20, 25, 30]
precision_ubfcbr = []
recall_ubfcbr = []
mae_ubfcbr = []
precision_ubf = []
recall_ubf = []
mae_ubf = []
# precision_uibf = []
# recall_uibf = []
# mae_uibf = []

for i in n:
    precision, recall, mae = ubfcbr.evaluate(i)
    precision_ibf.append(precision)
    recall_ibf.append(recall)
    mae_ibf.append(mae)

    precision, recall, mae = ubf.evaluate(i)
    precision_ubf.append(precision)
    recall_ubf.append(recall)
    mae_ubf.append(mae)

    # precision, recall, mae = uibf.evaluate(i)
    # precision_uibf.append(precision)
    # recall_uibf.append(recall)
    # mae_uibf.append(mae)