In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import random
from copy import deepcopy


In [6]:
class DataGenerator(object):
    """Build a leave-one-out train/test split with sampled negatives from a ratings table.

    Attributes set by ``__init__``:
        ratings: the DataFrame that was passed in (kept by reference, never modified).
        user_pool: set of the distinct ``userId`` values.
        item_pool: set of the distinct ``itemId`` values.
        negatives: DataFrame with one row per user and the columns
            ``userId``, ``negative_items`` and ``negative_samples``.
        train_ratings / test_ratings: binarized ``[userId, itemId, rating]`` frames.
    """

    def __init__(self, ratings, num_negative_samples=99):
        """
        args:
            ratings: pd.DataFrame, which contains 4 columns = ['userId', 'itemId', 'rating', 'timestamp']
            num_negative_samples: number of non-interacted items drawn per user (default 99)
        raises:
            AssertionError: if a required column is missing, or if the set of users in
                the train split differs in size from the one in the test split.
            ValueError: if a user has fewer than `num_negative_samples` non-interacted items.
        """
        # 'timestamp' is required as well: the leave-one-out split ranks by it.
        for column in ('userId', 'itemId', 'rating', 'timestamp'):
            assert column in ratings.columns, "ratings is missing column '{}'".format(column)

        self.ratings = ratings
        preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings['userId'].unique())
        self.item_pool = set(self.ratings['itemId'].unique())
        # Negatives are derived from the raw interactions, i.e. every rated item counts
        # as "interacted", whatever its rating value.
        self.negatives = self._sample_negative(ratings, num_negative_samples)
        self.train_ratings, self.test_ratings = self._train_test_split_loo(preprocess_ratings)

    def _binarize(self, ratings):
        """Return a copy of `ratings` where every rating > 0 is set to 1.0 (implicit feedback)."""
        ratings = ratings.copy()
        # Single .loc assignment instead of chained indexing: the chained form emits
        # SettingWithCopyWarning and does not write through under pandas Copy-on-Write.
        ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0
        return ratings

    def _train_test_split_loo(self, ratings):
        """Leave-one-out split: each user's most recent interaction goes to the test set.

        returns:
            (train, test): two DataFrames restricted to ['userId', 'itemId', 'rating'].
        raises:
            AssertionError: if train and test do not contain the same number of distinct
                users (e.g. a user has only a single interaction).
        """
        # Rank per user by descending timestamp; method='first' breaks ties by row order,
        # so exactly one row per user gets rank 1. Kept as a local Series so the input
        # frame is not extended with a helper column.
        rank_latest = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)
        # Test samples are the ones with the highest timestamp ...
        test = ratings[rank_latest == 1]
        # ... all others form the training set.
        train = ratings[rank_latest > 1]
        # Every user needs at least two interactions so that both sets cover the same users.
        assert train['userId'].nunique() == test['userId'].nunique(), \
            "every user needs at least two interactions for a leave-one-out split"
        columns = ['userId', 'itemId', 'rating']
        return train[columns], test[columns]

    def _sample_negative(self, ratings, num_negative_samples=99):
        """Return, per user, all non-interacted items and a random sample of them.

        returns:
            DataFrame with columns ['userId', 'negative_items', 'negative_samples'], where
            `negative_items` is a set and `negative_samples` a list of
            `num_negative_samples` distinct items.
        raises:
            ValueError: (from random.sample) if a user has fewer negatives than requested.
        """
        # For each user, the set of items he interacted with.
        interact_status = (
            ratings.groupby('userId')['itemId']
            .apply(set)
            .reset_index()
            .rename(columns={'itemId': 'interacted_items'})
        )
        interact_status['negative_items'] = interact_status['interacted_items'].apply(
            lambda seen: self.item_pool - seen)
        # random.sample() no longer accepts a set (TypeError since Python 3.11); sorting
        # yields a sequence and makes the draw reproducible for a given random seed.
        interact_status['negative_samples'] = interact_status['negative_items'].apply(
            lambda candidates: random.sample(sorted(candidates), num_negative_samples))
        return interact_status[['userId', 'negative_items', 'negative_samples']]

    def save(self, filename, format="CMN"):
        """Write the train/test data to disk.

        CMN format: one ``.npz`` archive (written with ``np.savez``) holding
        - train_data
            [[user id, item id], ...]
        - test_data
            {userid: (pos_id, [neg_id1, neg_id2, ...])}
            stored as a pickled object, so it must be read back with
            ``np.load(..., allow_pickle=True)``.

        args:
            filename: target path passed to ``np.savez``.
            format: "CMN" (default) or "NCF". "NCF" is accepted but not implemented
                yet and writes nothing.
        raises:
            ValueError: if `format` is neither "CMN" nor "NCF".
        """
        if format == "CMN":
            train_data = self.train_ratings[["userId", "itemId"]].to_numpy()
            test_data = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
            test_data = {
                user: (positive, negatives)
                for user, positive, negatives in zip(
                    test_data['userId'], test_data['itemId'], test_data['negative_samples'])
            }
            np.savez(filename, train_data=train_data, test_data=test_data)
        elif format == "NCF":
            # Placeholder kept from the original implementation: nothing is written.
            pass
        else:
            # Previously an unknown format fell through and silently saved nothing.
            raise ValueError("unknown format '{}', expected 'CMN' or 'NCF'".format(format))
    # def instance_a_train_loader(self, num_negatives, batch_size):
    #     """instance train loader for one training epoch"""
    #     users, items, ratings = [], [], []
    #     train_ratings = pd.merge(self.train_ratings, self.negatives[['userId', 'negative_items']], on='userId')
    #     train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(x, num_negatives))
    #     for row in train_ratings.itertuples():
    #         users.append(int(row.userId))
    #         items.append(int(row.itemId))
    #         ratings.append(float(row.rating))
    #         for i in range(num_negatives):
    #             users.append(int(row.userId))
    #             items.append(int(row.negatives[i]))
    #             ratings.append(float(0))  # negative samples get 0 rating
    #     dataset = UserItemRatingDataset(user_tensor=torch.LongTensor(users),
    #                                     item_tensor=torch.LongTensor(items),
    #                                     target_tensor=torch.FloatTensor(ratings))
    #     return DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # @property
    # def evaluate_data(self):
    #     """create evaluate data"""
    #     test_ratings = pd.merge(self.test_ratings, self.negatives[['userId', 'negative_samples']], on='userId')
    #     test_users, test_items, negative_users, negative_items = [], [], [], []
    #     for row in test_ratings.itertuples():
    #         test_users.append(int(row.userId))
    #         test_items.append(int(row.itemId))
    #         for i in range(len(row.negative_samples)):
    #             negative_users.append(int(row.userId))
    #             negative_items.append(int(row.negative_samples[i]))
    #     return [torch.LongTensor(test_users), torch.LongTensor(test_items), torch.LongTensor(negative_users),
    #             torch.LongTensor(negative_items)]

# ML1M-data

In [7]:
# Raw MovieLens-1M ratings: '::'-separated, no header row.
ml1m_dir = '/home/pollakg/polybox/CSE/master/2nd_term/Deep Learning/project/project-git/data/ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(
    ml1m_dir,
    sep='::',
    header=None,
    names=['uid', 'mid', 'rating', 'timestamp'],
    engine='python',
)

# Lookup table raw user id -> contiguous 0-based userId, numbered in order of first appearance.
unique_user_id = (
    ml1m_rating[['uid']]
    .drop_duplicates()
    .assign(userId=lambda frame: np.arange(len(frame)))
)
# A left join attaches the new userId to every rating row.
ml1m_rating = ml1m_rating.merge(unique_user_id, on=['uid'], how='left')

# Same re-indexing for the items (raw movie id -> itemId).
unique_item_id = (
    ml1m_rating[['mid']]
    .drop_duplicates()
    .assign(itemId=lambda frame: np.arange(len(frame)))
)
ml1m_rating = ml1m_rating.merge(unique_item_id, on=['mid'], how='left')

# Keep only the columns DataGenerator works with.
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating['userId'].min(), ml1m_rating['userId'].max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating['itemId'].min(), ml1m_rating['itemId'].max()))


Range of userId is [0, 6039]
Range of itemId is [0, 3705]


## Epinions Data

In [11]:
# parse file
import json
import ast

# Read the Epinions dump: each non-empty line holds one record written as a Python
# literal, which is why it is parsed with ast.literal_eval instead of json.
lines = []
with open('../data/epinions/epinions_data/epinions.json', 'r') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline); literal_eval would raise on them.
        if line.strip():
            lines.append(ast.literal_eval(line))

df = pd.DataFrame(lines)
# rename() without inplace=True returns a new frame. Renaming the column slice of `df`
# in place is what triggered the SettingWithCopyWarning.
epi_rating = df[["user", "item", "stars", "time"]].rename(
    columns={'stars': 'rating', 'time': 'timestamp'})

# Reindex: map raw user names to contiguous 0-based userIds (order of first appearance).
unique_user_id = epi_rating[['user']].drop_duplicates()
unique_user_id['userId'] = np.arange(len(unique_user_id))
# Left join on the raw key => every rating row receives its userId.
epi_rating = pd.merge(epi_rating, unique_user_id, on=['user'], how='left')
# Same procedure for the items.
unique_item_id = epi_rating[['item']].drop_duplicates()
unique_item_id['itemId'] = np.arange(len(unique_item_id))
epi_rating = pd.merge(epi_rating, unique_item_id, on=['item'], how='left')
# Keep only the columns DataGenerator works with.
epi_rating = epi_rating[['userId', 'itemId', 'rating', 'timestamp']]
epi_rating

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


Unnamed: 0,userId,itemId,rating,timestamp
0,0,0,4.0,1027296000
1,1,1,2.0,1201305600
2,2,1,4.0,1118016000
3,3,2,4.0,1149292800
4,4,3,5.0,1012262400
...,...,...,...,...
188473,2351,41268,2.0,1012608000
188474,60,41268,4.0,1016409600
188475,116258,41268,1.0,1051747200
188476,4263,41268,5.0,1029974400


## Generating statistics/features for each data set

Epinions

In [None]:
# Build the leave-one-out split and the sampled negatives for the Epinions ratings.
# NOTE(review): `df` holds a DataGenerator here, not a DataFrame, and it replaces the
# raw Epinions frame of the same name created in the loading cell.
df = DataGenerator(epi_rating)


### MovieLens

In [8]:
# MovieLens-1M: build the split and negatives, list the attributes the generator
# exposes, and write the archive in the default "CMN" format.
df = DataGenerator(ml1m_rating)
print(df.__dict__.keys())
df.save("/home/pollakg/polybox/CSE/master/2nd_term/Deep Learning/project/project-git/data/ml-1m/ml.npz")

dict_keys(['ratings', 'user_pool', 'item_pool', 'negatives', 'train_ratings', 'test_ratings'])


## Just for testing purposes

In [None]:
# Round-trip check: reload the archive written by DataGenerator.save.
# NOTE: this rebinds `ml1m_dir`, which earlier pointed at the raw ratings.dat file.
ml1m_dir = '/home/pollakg/polybox/CSE/master/2nd_term/Deep Learning/project/project-git/data/ml-1m/ml.npz'
# allow_pickle=True is needed because save() stores test_data as a Python dict
# (an object array); only load archives from trusted sources this way.
_data = np.load(ml1m_dir, allow_pickle=True)
# Keep the first two columns of the training array (save() writes [userId, itemId] rows).
train_data = _data['train_data'][:, :2]
test_data_ = _data['test_data']
# .tolist() unwraps the object array into the stored dict; display its keys (the user ids).
test_data_.tolist().keys()

# # Neighborhoods
# user_items = defaultdict(set)
# item_users = defaultdict(set)
# for u, i in train_data:
#     user_items[u].add(i)
#     item_users[i].add(u)
# # Get a list version so we do not need to perform type casting
# item_users_list = {k: list(v) for k, v in item_users.items()}
# # maximum number of users that rated an item i
# _max_user_neighbors = max([len(x) for x in self.item_users.values()])
# user_items = dict(self.user_items)
# item_users = dict(self.item_users)


In [316]:
# Load the train/test arrays of two existing .npz archives into dicts keyed by file name,
# presumably to compare their layout with the output of DataGenerator.save (TODO confirm).
files = ["citeulike-a.npz", "pinterest.npz"]
# NOTE(review): npz_files is never used in this cell.
npz_files = []
test = {}
train = {}
for f in files:
    # allow_pickle=True permits unpickling object arrays; only use it on trusted files.
    npz = np.load(f, allow_pickle=True)
    train[f] = npz["train_data"]
    test[f] = npz["test_data"]
    #print(len(test[f]), len(test[f][0]),"[1, ",len(test[f][0][1]),"]")

# Only the last expression of a cell is rendered, so the slice below is evaluated
# but not displayed.
train["pinterest.npz"][:, :2]
test["pinterest.npz"].tolist()

In [328]:
# Inspect the container types, shapes and a few rows of the Pinterest arrays.
print(type(train["pinterest.npz"]))
print(type(test["pinterest.npz"]))
print(train["pinterest.npz"].shape)
# The recorded output shows shape () for test_data: a 0-d array wrapping one object.
print(test["pinterest.npz"].shape)
# Rows 1..19 of the training array.
print(train["pinterest.npz"][1:20])
# Length of element [1] of entry 0 of the unwrapped test object (100 in the recorded output).
print(len(test["pinterest.npz"].tolist()[0][1]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(1445622, 2)
()
[[ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  7]
 [ 0  8]
 [ 0  9]
 [ 0 10]
 [ 0 11]
 [ 0  0]
 [ 0 13]
 [ 0 14]
 [ 0 15]
 [ 0 16]
 [ 0 17]
 [ 0 18]
 [ 0 19]
 [ 0 20]
 [ 0  0]]
100
