In [2]:
import pandas as pd
import pickle
import math, os

class ItemCF:
    """Item-based collaborative-filtering recommender built from click logs.

    The click logs are CSV files that must contain the columns ``user_id``,
    ``click_article_id`` and ``click_timestamp``. Three artefacts are derived
    from them and pickled into ``cache_dir`` so later calls can skip the
    expensive build step:

    * ``user_item_dict.pkl`` -- ``{user_id: [(article_id, timestamp), ...]}``
    * ``item_similarity_dict.pkl`` -- ``{article_id: {other_id: similarity}}``
    * ``top_k_hot_items.pkl`` -- the most-clicked articles (a pandas Series
      of click counts indexed by article id)

    Args:
        train_data_path: Path of the training click-log CSV.
        test_data_path: Path of the test click-log CSV.
        cache_dir: Directory holding the pickled artefacts. Defaults to
            ``'cache'`` (relative to the current working directory); it is
            created on demand.
    """

    # File names of the pickled artefacts inside ``cache_dir``.
    USER_ITEM_CACHE = 'user_item_dict.pkl'
    ITEM_SIM_CACHE = 'item_similarity_dict.pkl'
    HOT_ITEMS_CACHE = 'top_k_hot_items.pkl'

    def __init__(self, train_data_path, test_data_path, cache_dir='cache'):
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path
        self.cache_dir = cache_dir

    def _cache_path(self, file_name):
        """Return the path of ``file_name`` inside the cache directory."""
        return os.path.join(self.cache_dir, file_name)

    def _dump_cache(self, obj, file_name):
        """Pickle ``obj`` into the cache directory, creating it if needed."""
        os.makedirs(self.cache_dir, exist_ok=True)
        with open(self._cache_path(file_name), 'wb') as f:
            pickle.dump(obj, f)

    def _load_cache(self, file_name):
        """Unpickle and return ``file_name`` from the cache directory."""
        # NOTE(security): pickle.load can execute arbitrary code. Only point
        # cache_dir at files this class wrote itself.
        with open(self._cache_path(file_name), 'rb') as f:
            return pickle.load(f)

    def _read_data(self, all_data=False):
        """Load the click logs into ``self.data`` with duplicate rows removed.

        Args:
            all_data: When true, the test log is appended to the training
                log; otherwise only the training log is kept.

        Raises:
            OSError: A CSV file is missing or unreadable.
            ValueError: A CSV file is empty or cannot be parsed.
        """
        try:
            train_data = pd.read_csv(self.train_data_path)
            test_data = pd.read_csv(self.test_data_path)
        except (OSError, ValueError):
            # Re-raise after reporting: swallowing the error here would only
            # defer the failure to an unrelated AttributeError on self.data.
            print('Data read failed!')
            raise
        print('Data is read successfully!')

        data = train_data
        if all_data:
            data = pd.concat([train_data, test_data])

        # drop_duplicates returns a new frame; the result must be kept.
        self.data = data.drop_duplicates(
            ['user_id', 'click_article_id', 'click_timestamp'])

    def _build_user_item_matrix(self):
        """Build and cache ``self.user_item_dict`` from ``self.data``.

        Users are stored in ascending ``user_id`` order, and each user's
        clicks keep the row order of ``self.data``.
        """
        # A stable sort keeps the per-user click order intact.
        ordered = self.data.sort_values('user_id', kind='mergesort')
        user_item_dict = {}
        for user, article, timestamp in zip(ordered['user_id'],
                                            ordered['click_article_id'],
                                            ordered['click_timestamp']):
            user_item_dict.setdefault(user, []).append((article, timestamp))
        self.user_item_dict = user_item_dict

        self._dump_cache(self.user_item_dict, self.USER_ITEM_CACHE)

        print('Build and save user-item matrix successfully!')

    def _build_item_similarity_matrix(self):
        """Read the training data and build the item-item similarity dict.

        The similarity of two articles is their co-occurrence count over all
        user histories divided by ``sqrt(count_i) * sqrt(count_j)``, where
        ``count_x`` is the number of click records of article ``x``. The
        result is stored in ``self.item_sim`` and pickled to the cache.
        """
        self._read_data()
        self._build_user_item_matrix()

        item_sim = {}
        item_cnt = {}
        for history in self.user_item_dict.values():
            articles = [article for article, _ in history]
            for item_i in articles:
                neighbours = item_sim.setdefault(item_i, {})
                item_cnt[item_i] = item_cnt.get(item_i, 0) + 1
                for item_j in articles:
                    if item_i != item_j:
                        # The increment could be weighted (e.g. by position
                        # or time gap); that belongs to improved variants of
                        # collaborative filtering.
                        neighbours[item_j] = neighbours.get(item_j, 0) + 1

        # Normalise the raw co-occurrence counts in place.
        for item_i, neighbours in item_sim.items():
            norm_i = math.sqrt(item_cnt[item_i])
            for item_j in neighbours:
                neighbours[item_j] /= norm_i * math.sqrt(item_cnt[item_j])

        self._dump_cache(item_sim, self.ITEM_SIM_CACHE)

        self.item_sim = item_sim

        print('Build and save item similarity matrix successfully!')

    def _get_hot_items(self, top_k=50):
        """Compute and cache the ``top_k`` most-clicked articles.

        Args:
            top_k: Number of hot articles to keep.
        """
        # The similarity artefacts may have come from the cache, in which
        # case the click log has not been read yet.
        if getattr(self, 'data', None) is None:
            self._read_data()

        click_counts = self.data['click_article_id'].value_counts()
        self.top_k_hot_items = click_counts.iloc[:top_k]

        self._dump_cache(self.top_k_hot_items, self.HOT_ITEMS_CACHE)

    def _load_or_build_models(self):
        """Load every artefact from the cache, building whatever is missing."""
        have_similarity = (
            os.path.exists(self._cache_path(self.ITEM_SIM_CACHE))
            and os.path.exists(self._cache_path(self.USER_ITEM_CACHE))
        )
        if have_similarity:
            self.user_item_dict = self._load_cache(self.USER_ITEM_CACHE)
            print('Load user_item_dict.pkl !')
            self.item_sim = self._load_cache(self.ITEM_SIM_CACHE)
            print('Load item_similarity_dict.pkl !')
        else:
            self._build_item_similarity_matrix()

        if os.path.exists(self._cache_path(self.HOT_ITEMS_CACHE)):
            self.top_k_hot_items = self._load_cache(self.HOT_ITEMS_CACHE)
            print('Load top_k_hot_items.pkl !')
        else:
            self._get_hot_items()

    def recommend_by_item_CF(self, user_id, recommend_num=5,
                             filter_clicked=False):
        """Recommend articles for ``user_id`` with item-based CF.

        For every article in the user's history, its ``recommend_num`` most
        similar articles are collected and their similarities summed per
        candidate. If fewer than ``recommend_num`` candidates result, the
        list is padded with hot articles. A user with no history gets hot
        articles only.

        Args:
            user_id: The user to recommend for.
            recommend_num: Maximum number of recommendations to return.
            filter_clicked: When true, articles the user already clicked are
                never recommended. Defaults to false, in which case clicked
                articles may appear in the result.

        Returns:
            A list of at most ``recommend_num`` ``(article_id, score)``
            tuples, best first. Hot-article padding carries the score ``-1``.
        """
        self._load_or_build_models()

        clicked_items = self.user_item_dict.get(user_id, [])
        excluded = set()
        if filter_clicked:
            excluded = {article for article, _ in clicked_items}

        related_items = {}
        for article, _ in clicked_items:
            candidates = [
                (other, score)
                for other, score in self.item_sim.get(article, {}).items()
                if other not in excluded
            ]
            candidates.sort(key=lambda pair: pair[1], reverse=True)
            for other, score in candidates[:recommend_num]:
                related_items[other] = related_items.get(other, 0) + score

        ranked = sorted(related_items.items(),
                        key=lambda pair: pair[1], reverse=True)

        if len(ranked) < recommend_num:
            already_listed = set(related_items) | excluded
            for hot_item in list(self.top_k_hot_items.index):
                if hot_item in already_listed:
                    continue
                # Hot-article padding gets a negative score (-1) so it can
                # be told apart from real CF candidates.
                ranked.append((hot_item, -1))
                if len(ranked) >= recommend_num:
                    break
        return ranked[:recommend_num]


# Build the recommender from the train/test click logs and request the ten
# best recommendations for user 10025. The final expression is left bare so a
# notebook cell displays the returned list.
itemCF = ItemCF(
    train_data_path='/data/zhy/recommendation_system/Rec_sys/data/train_click_log.csv',
    test_data_path='/data/zhy/recommendation_system/Rec_sys/data/testA_click_log.csv',
)
itemCF.recommend_by_item_CF(user_id=10025, recommend_num=10)

Data is read successfully!
Build and save user-item matrix successfully!
Build and save item similarity matrix successfully!


[(363115, 0.42156459187451634),
 (320255, 0.2672612419124244),
 (217803, 0.2672612419124244),
 (160330, 0.2672612419124244),
 (73284, 0.26228527078957187),
 (69226, 0.1889822365046136),
 (73287, 0.1649572197684645),
 (73288, 0.1649572197684645),
 (88468, 0.1543033499620919),
 (298731, 0.1336306209562122)]