In [1]:
import pandas as pd
import numpy as np

# Location of the Amazon Books dump and the ratings CSV inside it.
# The two are joined by plain string concatenation when read, so dir_path
# must keep its trailing '/'.
dir_path = './Amazon_Review/Books/'
rating_file = 'ratings_Books.csv'

In [2]:
def read_user_rating_records():
    """Load the ratings CSV into a DataFrame.

    The path is built from the module-level ``dir_path`` and ``rating_file``.
    Column names are supplied explicitly, so the first line of the file is
    read as data rather than as a header.

    Returns:
        pandas.DataFrame with columns user_id, item_id, rating, timestamp.
    """
    csv_path = dir_path + rating_file
    return pd.read_csv(
        csv_path,
        sep=',',
        names=['user_id', 'item_id', 'rating', 'timestamp'],
        engine='python',
    )

# Load the full ratings table; later cells rebind and filter this name.
data_records = read_user_rating_records()
# Not the last expression of the cell, so this preview is evaluated but not displayed.
data_records.head()
# Spot-check the rows at positions 1, 10 and 20 (this is what the cell displays).
data_records.iloc[[1, 10, 20]]

Unnamed: 0,user_id,item_id,rating,timestamp
1,A2IIIDRK3PRRZY,116,1.0,1395619200
10,AYEDW3BFK53XK,13714,5.0,1325462400
20,A2GKR2Q7MD8DG4,29831,5.0,1393286400


In [3]:
# Number of distinct users and distinct items in the raw ratings table.
print(data_records['user_id'].nunique(), data_records['item_id'].nunique())

8026324 2330066


In [4]:
# Turn explicit ratings into implicit feedback: ratings >= 4 become 1,
# everything else becomes 0 and is then dropped.
# WARNING: this cell is not idempotent. After one run every remaining rating
# is 1, so running it again maps all rows to 0 (1 < 4) and the final filter
# leaves an empty frame. Re-run the loading cell before re-running this one.
# The statement order matters: the "< 4" assignment must come before the
# ">= 4" assignment writes 1s.
data_records.loc[data_records.rating < 4, 'rating'] = 0
data_records.loc[data_records.rating >= 4, 'rating'] = 1
data_records = data_records[data_records.rating > 0]
# Distinct users / items that have at least one positive interaction.
print(len(data_records['user_id'].unique()), len(data_records['item_id'].unique()))

7118528 2138299


In [5]:
from copy import deepcopy
def remove_infrequent_items(data, min_counts=5):
    """Keep only the rows whose item occurs at least ``min_counts`` times.

    Args:
        data: DataFrame with an 'item_id' column. It is not modified.
        min_counts: minimum number of rows an item must have to be kept.

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        are preserved).
    """
    # Boolean-mask indexing already produces a new frame, so the input does
    # not need a defensive deep copy (which would double peak memory on a
    # table with millions of rows).
    counts = data['item_id'].value_counts()
    frequent_items = counts[counts >= min_counts].index
    df = data[data['item_id'].isin(frequent_items)]

    print("items with < {} interactoins are removed".format(min_counts))
    return df

def remove_infrequent_users(data, min_counts=10):
    """Keep only the rows whose user occurs at least ``min_counts`` times.

    Args:
        data: DataFrame with a 'user_id' column. It is not modified.
        min_counts: minimum number of rows a user must have to be kept.

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        are preserved).
    """
    # Boolean-mask indexing already produces a new frame, so the input does
    # not need a defensive deep copy (which would double peak memory on a
    # table with millions of rows).
    counts = data['user_id'].value_counts()
    frequent_users = counts[counts >= min_counts].index
    df = data[data['user_id'].isin(frequent_users)]

    print("users with < {} interactoins are removed".format(min_counts))
    return df

# Users are filtered first, then items. Removing items afterwards also removes
# rows from the users that were kept, so the 20-interaction user threshold is
# NOT guaranteed to hold in the final filtered_data (the recorded output of
# the next cell lists users left with a single interaction). Only the item
# threshold is guaranteed after this cell.
filtered_data = remove_infrequent_users(data_records, 20)
filtered_data = remove_infrequent_items(filtered_data, 20)
print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())))

users with < 20 interactoins are removed
items with < 20 interactoins are removed
num of users:76696, num of items:41265


In [6]:
# Ten items, then ten users, with the fewest interactions after filtering.
print(filtered_data.groupby('item_id').size().sort_values().head(10))
print(filtered_data.groupby('user_id').size().sort_values().head(10))

item_id
0373876998    20
1499295952    20
B00KF073C8    20
1499293224    20
0778315126    20
B00G0KFZ7C    20
B00KEZV34S    20
B00G00GUVM    20
1499198469    20
B00C1NCUL6    20
dtype: int64
user_id
A18Y4FI13QPQ8V    1
A384ODJB85EGS5    1
A1GORK6WGLQQF2    1
A1GOXX94PDPR59    1
A1VFG5SVYMBH7K    1
A1VF5LGIODDNG7    1
A385OXH06QRH3W    1
A5Q65JECU7D1C     1
A5PJHEFSM7PZ3     1
A388N4UTHDZUI4    1
dtype: int64


In [7]:
# read item's reviews
# Item ids that survived filtering: as an array (kept for ordered iteration)
# and as a set (used for membership tests while scanning the review dump).
item_list = filtered_data['item_id'].unique()
item_set = set(item_list)

print(item_list[:10])

# Gzipped review dump, read line by line as JSON; it is opened from dir_path.
review_file = 'reviews_Books_5.json.gz'

import json
import gzip

def parse(path):
    """Lazily yield one decoded JSON object per line of a gzip file.

    Args:
        path: path of a gzip-compressed file with one JSON document per line.

    Yields:
        The object decoded from each line, in file order.
    """
    # The context manager closes the file when the generator is exhausted,
    # closed, or garbage-collected; previously the handle was never closed.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield json.loads(l)

# For every kept item, remember a single review text: the one with the highest
# helpfulness ratio among reviews longer than 10 characters.
review_dict = dict()     # [item asin] = chosen review text
review_helpful = dict()  # [item asin] = helpfulness ratio of the chosen review
for record in parse(dir_path + review_file):
    asin = record['asin']
    if asin not in item_set:
        continue
    if asin in review_dict:
        # helpful = [helpful votes, total votes]; +0.01 avoids division by zero
        ratio = record['helpful'][0] / float(record['helpful'][1] + 0.01)
        # Only a strictly more helpful, long-enough review replaces the stored one.
        if ratio <= review_helpful[asin] or len(record['reviewText']) <= 10:
            continue
    elif len(record['reviewText']) <= 10:
        # First candidates for an item are skipped while they are too short.
        continue
    else:
        ratio = record['helpful'][0] / float(record['helpful'][1] + 0.01)
    review_dict[asin] = record['reviewText']
    review_helpful[asin] = ratio

# print review_dict['1300966947']
            

['000100039X' '0002007770' '0002051850' '000215725X' '0002219417'
 '000222383X' '0002226618' '000224053X' '0002242052' '0002247399']


In [8]:
# delete items without reviews
item_without_review = [item_id for item_id in item_list if item_id not in review_dict]

print(item_without_review)

# Drop all review-less items in one vectorised pass instead of re-filtering
# the whole frame once per missing item.
if item_without_review:
    filtered_data = filtered_data[~filtered_data['item_id'].isin(item_without_review)]

item_list = filtered_data['item_id'].unique()
print(len(item_list))

[]
41265


In [9]:
# Sanity check: print the id of any item whose stored review text is shorter
# than 5 characters. The loading loop only stores texts longer than 10
# characters, so this is expected to print nothing.
for item_id, review in review_dict.items():
    if len(review) < 5:
        print(item_id)
# print review_dict['B002IUAUI2']

In [11]:
# convert records to sequential data per user
def convert_data(data, min_unique_items=10):
    """Build one chronologically ordered item sequence per user.

    Args:
        data: DataFrame with 'user_id', 'item_id' and 'timestamp' columns.
            It is not modified.
        min_unique_items: users with fewer distinct items than this are
            dropped from the result (default 10, the previously hard-coded
            threshold).

    Returns:
        pandas.Series indexed by user_id whose values are lists of item ids
        ordered by ascending timestamp. Repeated items stay in the list; only
        the user filter counts distinct items.
    """
    # sort_values returns a new frame, so the input needs no defensive copy.
    df_ordered = data.sort_values(['timestamp'], ascending=True)
    items_by_user = df_ordered.groupby('user_id')['item_id']

    sequences = items_by_user.apply(list)
    print("succressfully created sequencial data! head:", sequences.head(5))

    unique_counts = items_by_user.nunique()
    kept_users = unique_counts[unique_counts >= min_unique_items].index
    sequences = sequences.loc[kept_users]
    print(sequences.head(10))
    print(len(sequences))
    return sequences

# Per-user item sequences, keyed by the raw user id.
seq_data = convert_data(filtered_data)
print(type(seq_data))

succressfully created sequencial data! head: user_id
A002359833QJM7OQHCXWY    [B00BUKRALG, B00BWY3UKU, B004OEKH7Y, 076420477...
A00463782V7TKAP9EMNL     [B00ES4C28C, 1941450008, B004XJ6922, 148481477...
A0047322388NOTO4N8SKD     [0884191338, 0849947200, 0333516516, 1589267109]
A00579222Q4YKY0J53RLA    [193415766X, 0345492641, 1451608160, 193639924...
A006458827ALF2J23JJTO    [1489539042, 1482616319, B00DBE8QDU, B00DUFCJ1...
Name: item_id, dtype: object
user_id
A002359833QJM7OQHCXWY    [B00BUKRALG, B00BWY3UKU, B004OEKH7Y, 076420477...
A00463782V7TKAP9EMNL     [B00ES4C28C, 1941450008, B004XJ6922, 148481477...
A00579222Q4YKY0J53RLA    [193415766X, 0345492641, 1451608160, 193639924...
A006458827ALF2J23JJTO    [1489539042, 1482616319, B00DBE8QDU, B00DUFCJ1...
A0092581WFYQNV4KMUZ3     [0425263916, 0615744257, 0060734019, 032147404...
A0099735VDZ3HDCAAYKL     [0451228219, 0451229444, B008XOWVVG, B006BFX4U...
A010971113OD625HDB6X8    [0606262520, 0985023058, 0373210515, 148270683...
A010997525

In [12]:
# Raw-id view of the sequences: {user_id: [item_id, ...]}.
user_item_dict = seq_data.to_dict()

# user_mapping[inner_user_id] -> raw user id, in seq_data order.
user_mapping = []
# NOTE: this rebinds item_set / item_list from the earlier review-loading cells.
item_set = set()
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for user_id, item_list in seq_data.items():
    user_mapping.append(user_id)
    item_set.update(item_list)
# item_mapping[inner_item_id] -> raw item id. The order comes from set
# iteration and is arbitrary, so inner item ids are only meaningful together
# with the item_mapping produced in the same run.
item_mapping = list(item_set)

print(len(user_mapping), len(item_mapping))

52406 41264


In [13]:
def generate_inverse_mapping(data_list):
    """Return a dict mapping each element of ``data_list`` to its position.

    If an element occurs more than once, its last position wins.
    """
    return {true_id: inner_id for inner_id, true_id in enumerate(data_list)}

def convert_to_inner_index(user_records, user_mapping, item_mapping):
    """Re-express per-user item lists with positional (inner) ids.

    Args:
        user_records: mapping from raw user id to a sequence of raw item ids.
        user_mapping: list of raw user ids; the position is the inner user id.
        item_mapping: list of raw item ids; the position is the inner item id.

    Returns:
        (inner_user_records, user_inverse_mapping, item_inverse_mapping), where
        inner_user_records[u] is the item list of user_mapping[u] translated
        to inner item ids, and the two dicts map raw ids to inner ids.
    """
    user_inverse_mapping = generate_inverse_mapping(user_mapping)
    item_inverse_mapping = generate_inverse_mapping(item_mapping)

    inner_user_records = [
        [item_inverse_mapping[real_item_id] for real_item_id in user_records[real_user_id]]
        for real_user_id in user_mapping
    ]

    return inner_user_records, user_inverse_mapping, item_inverse_mapping

# Translate every user's raw item sequence to inner item ids; the list position
# of each sequence is the inner user id.
inner_data_records, user_inverse_mapping, item_inverse_mapping = convert_to_inner_index(user_item_dict, user_mapping, item_mapping)
print(inner_data_records[:5])

[[2943, 9318, 6295, 3211, 30412, 37719, 5338, 39088, 3321, 22536, 6107], [4010, 9926, 17692, 32858, 38332, 33292, 29533, 19637, 24784, 6624, 35398, 5187, 6639, 31289, 5330, 10625, 21043, 10026, 34328, 38276], [13874, 32449, 195, 38186, 18105, 25384, 31149, 25300, 34987, 33252, 4178, 13725, 22148], [33172, 36596, 6106, 7487, 40512, 3205, 24847, 23494, 28024, 14166, 37537, 1002, 9174, 28901, 33507, 24688, 6719, 25446, 38409, 9335, 28950, 34620, 34999, 3066, 40394, 7835, 1697, 6750, 37529, 19262, 10556, 28446, 31307, 6631, 12231, 28245, 40699, 8256], [4217, 24644, 13110, 32465, 27742, 8263, 34335, 12933, 29505, 21141, 20076, 4066, 25569, 32584, 13665, 13271, 25060, 8452, 7648]]


In [14]:
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``<name>.pkl``, overwriting any existing file."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as out_file:
        pickle.dump(obj, out_file)

In [15]:
# Persist the inner-id sequences together with the two id lists needed to map
# inner ids back to raw ids (written as <name>.pkl in the working directory).
save_obj(inner_data_records, 'Books_item_sequences')
save_obj(user_mapping, 'Books_user_mapping')
save_obj(item_mapping, 'Books_item_mapping')

In [None]:
from scipy.sparse import csr_matrix

def generate_rating_matrix(train_set, num_users, num_items):
    """Build a sparse user x item interaction matrix.

    Args:
        train_set: sequence whose u-th element is the list of inner item ids
            of user u.
        num_users: number of rows of the result.
        num_items: number of columns of the result.

    Returns:
        scipy.sparse.csr_matrix of shape (num_users, num_items). Every
        (user, item) occurrence contributes 1, so an item repeated in a
        user's list yields a value greater than 1.
    """
    # Flatten to (user, item) pairs, then split into the coordinate arrays
    # that the sparse constructor expects.
    pairs = [
        (user_id, article)
        for user_id, article_list in enumerate(train_set)
        for article in article_list
    ]
    row = np.array([user_id for user_id, _ in pairs])
    col = np.array([article for _, article in pairs])
    data = np.array([1] * len(pairs))

    return csr_matrix((data, (row, col)), shape=(num_users, num_items))

# Build the user x item interaction matrix, then transpose it so that each row
# describes one item by the users who interacted with it.
rating_matrix = generate_rating_matrix(inner_data_records, len(user_mapping), len(item_mapping))
rating_matrix = rating_matrix.transpose()

from sklearn.metrics.pairwise import cosine_similarity

# Item-item cosine similarity between the rows of the transposed matrix.
# NOTE(review): later cells use relation_matrix as a dense array
# (np.fill_diagonal, boolean-mask assignment). With the ~41k items reported
# earlier that is an items x items array of floats -- confirm it fits in memory.
relation_matrix = cosine_similarity(rating_matrix)

In [None]:
# Density of the interaction matrix: stored entries / (users * items).
rating_matrix.nnz / float(len(user_mapping) * len(item_mapping))

In [None]:
# Ignore self-similarity before counting neighbours (modifies relation_matrix in place).
np.fill_diagonal(relation_matrix, 0)

# For every item count its neighbours with similarity >= 0.2, tracking
#   max_count: the largest neighbour count of any item
#   count:     how many items have at least one such neighbour
# Each row is scanned once; the count was previously computed in two loops.
max_count = 0
count = 0
for i in range(len(item_mapping)):
    num_related = np.count_nonzero(relation_matrix[i] >= 0.2)
    max_count = max(num_related, max_count)
    if num_related > 0:
        count += 1

# print() calls instead of Python 2 print statements, matching the earlier cells.
print(max_count)
print(count)
print(np.max(relation_matrix))
print(relation_matrix[0])
print(relation_matrix[-1])

In [None]:
# Binarise the similarities: values below 0.2 become 0, every remaining
# positive value becomes 1; the result is then stored as a sparse matrix.
relation_matrix[relation_matrix < 0.2] = 0
relation_matrix[relation_matrix > 0] = 1
relation_matrix = csr_matrix(relation_matrix)
# print() call instead of a Python 2 print statement, matching the earlier cells.
print(len(user_mapping), len(item_mapping))

In [None]:
# process review content
import re
import nltk

# generate the whole document: one raw review text per item, in item_mapping order
raw_reviews = [review_dict[item_id] for item_id in item_mapping]

# use nltk to remove stopwords, and stemming each word
# NOTE(review): stopwords.words() and nltk.word_tokenize() need the NLTK
# 'stopwords' and tokenizer data to be installed -- confirm on the target machine.
from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))
porter_stemmer = nltk.PorterStemmer()

all_review = []  # per item: list of stemmed tokens
review_str = []  # per item: the same tokens joined by single spaces
for raw_text in raw_reviews:
    # Replace every non-letter with a space, then lower-case the text.
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text).lower()

    # The text is already lower-case, so tokens can be compared to the
    # stopword list directly.
    tokens = [w for w in nltk.word_tokenize(letters_only) if w not in stopwords_set]

    stems = [porter_stemmer.stem(t) for t in tokens]
    all_review.append(stems)
    review_str.append(' '.join(stems))

# print() call instead of a Python 2 print statement, matching the earlier cells.
print(review_str[-1])

In [None]:
# convert to bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

# Count terms over the stemmed review strings, ignoring terms that appear in
# fewer than 3 documents (min_df=3).
vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, min_df=3)
word_counts = vectorizer.fit_transform(review_str)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations. vocab stays a plain
# list so later cells can slice and iterate it as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# print() calls instead of Python 2 print statements, matching the earlier cells.
print(len(vocab))
print(word_counts.data.max())
print(word_counts.data.min())
print(len(item_mapping))

In [None]:
# Density of the interaction matrix (same expression as in the earlier density cell).
rating_matrix.nnz / float(len(user_mapping) * len(item_mapping))

In [None]:
# store bag-of-words to file
def vocabulary_to_file(vocab, path='vocabulary.txt'):
    """Write the vocabulary to a text file, one word per line.

    Args:
        vocab: iterable of words (strings); the line number of a word is its
            position in the iterable.
        path: output file; defaults to 'vocabulary.txt' in the working
            directory. An existing file is overwritten.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as f0:
        f0.writelines(word + '\n' for word in vocab)


def word_count_to_file(item_list, word_count, path='word_counts.txt'):
    """Write per-document word counts to a text file, one document per line.

    Each line has the form
    ``<item> <num_words> <word_index>:<count> <word_index>:<count> ...``
    where ``num_words`` is the number of non-zero entries of the row.

    Args:
        item_list: sequence giving the label written for each row.
        word_count: sparse document-term matrix; iterating it must yield rows
            that expose ``indices``, ``data`` and ``count_nonzero()``.
        path: output file; defaults to 'word_counts.txt' in the working
            directory. An existing file is overwritten.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as f0:
        for i, document in enumerate(word_count):
            fields = [str(item_list[i]), str(document.count_nonzero())]
            fields.extend(
                str(indice) + ':' + str(value)
                for indice, value in zip(document.indices, document.data)
            )
            f0.write(' '.join(fields) + '\n')

# Write the vocabulary and the per-item word counts; rows of word_counts are
# labelled with the raw item ids from item_mapping.
vocabulary_to_file(vocab)
word_count_to_file(item_mapping, word_counts)

In [None]:
import pickle
# NOTE(review): save_obj is already defined with the same body in an earlier
# cell; this second definition silently shadows it and could be removed.
def save_obj(obj, name ):
    """Pickle ``obj`` into the file ``<name>.pkl``."""
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)


In [None]:
# Persist the raw-id user records and the thresholded item relation matrix.
# The two mapping files use the same names as in the earlier save cell and are
# therefore overwritten.
save_obj(user_item_dict, 'Books_user_records')
save_obj(user_mapping, 'Books_user_mapping')
save_obj(item_mapping, 'Books_item_mapping')
save_obj(relation_matrix, 'item_relation')

In [None]:
# Preview the vocabulary and the processed review of the last item.
# print() calls instead of Python 2 print statements, matching the earlier cells.
print(vocab[:10])
print(all_review[-1])
print(review_str[-1])

In [None]:
# Position of every vocabulary word.
word_to_index = {word: w_id for w_id, word in enumerate(vocab)}

# Each review as a sequence of vocabulary indices, in word order; words that
# are not in the vocabulary are skipped.
all_review_index = [
    [word_to_index[word] for word in review.split(' ') if word in word_to_index]
    for review in review_str
]

# print() call instead of a Python 2 print statement, matching the earlier cells.
print(all_review_index[-1])

In [None]:
# store word sequence to a file (written as review_word_sequence.pkl)
save_obj(all_review_index, 'review_word_sequence')

In [None]:
# Sanity check on the last user: the raw sequence, the same sequence read
# through user_item_dict, and its inner item ids.
# .iloc[-1] selects by position explicitly; seq_data is indexed by user id, so
# seq_data[-1] relied on a positional fallback.
last_sequence = seq_data.iloc[-1]
print(last_sequence)
user_inverse_mapping = generate_inverse_mapping(user_mapping)
item_inverse_mapping = generate_inverse_mapping(item_mapping)
print(user_item_dict[user_mapping[-1]])
print(sorted(item_inverse_mapping[item_id] for item_id in last_sequence))

In [None]:
# Stemmed tokens of the last item's review (print() call instead of a Python 2 print statement).
print(all_review[-1])

In [None]:
# Print the row index of every document with no stored vocabulary entries.
# getnnz(axis=1) returns the stored-entry count of all rows at once, instead
# of extracting each row as its own matrix.
for i in np.flatnonzero(word_counts.getnnz(axis=1) == 0):
    print(i)