In [1]:
import pandas as pd
import numpy as np

dir_path = './Amazon_Review/CDs/'
rating_file = 'ratings_CDs_and_Vinyl.csv'

In [2]:
def read_user_rating_records():
    col_names = ['user_id', 'item_id', 'rating', 'timestamp']
    data_records = pd.read_csv(dir_path + rating_file, sep=',', names=col_names, engine='python')
    return data_records

data_records = read_user_rating_records()
data_records.head()
data_records.iloc[[1, 10, 20]]

Unnamed: 0,user_id,item_id,rating,timestamp
1,AHG1GTQZUYNJN,1393774,5.0,1372723200
10,A1WX42M589VAMQ,1393774,5.0,1167350400
20,A2UA9KKUQCTEIN,1501348,5.0,1381017600


In [3]:
print(len(data_records['user_id'].value_counts()), len(data_records['item_id'].value_counts()))

1578597 486360


In [4]:
data_records.loc[data_records.rating < 4, 'rating'] = 0
data_records.loc[data_records.rating >= 4, 'rating'] = 1
data_records = data_records[data_records.rating > 0]
print(len(data_records['user_id'].unique()), len(data_records['item_id'].unique()))

1425149 457623


In [5]:
from copy import deepcopy
def remove_infrequent_items(data, min_counts=5):
    df = deepcopy(data)
    counts = df['item_id'].value_counts()
    df = df[df["item_id"].isin(counts[counts >= min_counts].index)]

    print("items with < {} interactoins are removed".format(min_counts))
    # print(df.describe())
    return df

def remove_infrequent_users(data, min_counts=10):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]

    print("users with < {} interactoins are removed".format(min_counts))
    # print(df.describe())
    return df

filtered_data = remove_infrequent_users(data_records, 10)
filtered_data = remove_infrequent_items(filtered_data, 8)
print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())))

users with < 10 interactoins are removed
items with < 8 interactoins are removed
num of users:31668, num of items:24635


In [6]:
print(filtered_data.groupby('item_id').size().sort_values(ascending=True)[:10])
print(filtered_data.groupby('user_id').size().sort_values(ascending=True)[:10])

item_id
B0001GNDN4    8
B0000026AN    8
B0000026B4    8
B00004X0KU    8
B000002UJ3    8
B000002UIK    8
B000002UI8    8
B0000026C5    8
B00004X0QN    8
B00004X0TH    8
dtype: int64
user_id
A02039013W06XH9FVVFUZ    1
A151Z5T4K8XK9X           1
A3DEWYL97V73PI           1
A19PW02C37RFX1           1
A3DMPUPJ248CGI           1
A21ON7MBZ14C8D           1
A1513MWN910XKQ           1
A3DOCWNJH2NLA5           1
A3DU61S18GW7NW           1
A3DUY3FHH21T3T           1
dtype: int64


In [7]:
# read item's reviews
item_list = filtered_data['item_id'].unique()
item_set = set(item_list)

print(item_list[:10])

review_file = 'reviews_CDs_and_Vinyl_5.json.gz'

import json
import gzip

def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
        # yield json.dumps(eval(l))

review_dict = dict()  # [review_id] = review_text
review_helpful = dict()
for l in parse(dir_path + review_file):
    if l['asin'] in item_set:
        if l['asin'] in review_dict:
            if l['helpful'][0] / float(l['helpful'][1] + 0.01) > review_helpful[l['asin']] and len(l['reviewText']) > 10:
                review_dict[l['asin']] = l['reviewText']
                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)
        else:
            if len(l['reviewText']) > 10:
                review_dict[l['asin']] = l['reviewText']
                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)

# print review_dict['1300966947']
            

['0738900370' '0738900672' '0738919039' '0760135002' '0767804341'
 '0769716903' '0769720994' '0780018664' '0780607287' '0783112793']


In [8]:
# delete items without reviews
item_without_review = []
for item_id in item_list:
    if item_id not in review_dict:
        item_without_review.append(item_id)

print(item_without_review)

for item_id in item_without_review:
    filtered_data = filtered_data[filtered_data['item_id'] != item_id]

item_list = filtered_data['item_id'].unique()
print(len(item_list))

[]
24635


In [9]:
for item_id, review in review_dict.iteritems():
    if len(review) < 5:
        print(item_id)
# print review_dict['B002IUAUI2']

In [10]:
# convert records to sequential data per user
def convert_data(data):
    # for each user, sort by timestamps
    df = deepcopy(data)
    df_ordered = df.sort_values(['timestamp'], ascending=True)
    data = df_ordered.groupby('user_id')['item_id'].apply(list)
    #print(data)
    #time_l = df_ordered.groupby('user')['checkin_time'].apply(list)
    #print(time_l)
    print("succressfully created sequencial data! head:", data.head(5))
    unique_data = df_ordered.groupby('user_id')['item_id'].nunique()
    data = data[unique_data[unique_data >= 5].index]
    print(data[:10])
    print(len(data))
    return data

seq_data = convert_data(filtered_data)
print(type(seq_data))

('succressfully created sequencial data! head:', user_id
A02039013W06XH9FVVFUZ                                         [B000008UGB]
A0644664269UTSPKMVCVH    [B000003G7W, B000009HQW, B000002IIG, B0000AINI...
A08161909WK3HU7UYTMW     [B00005OMGE, B0041WLBEC, B000069JJF, B000002MH...
A099280716ZEH5UPWAN4A    [B0000C23DW, B000005B0U, B009NOVCSS, B00000136...
A1002VY75YRZYF                                    [B000003JAH, B000002BMD]
Name: item_id, dtype: object)
user_id
A0644664269UTSPKMVCVH    [B000003G7W, B000009HQW, B000002IIG, B0000AINI...
A08161909WK3HU7UYTMW     [B00005OMGE, B0041WLBEC, B000069JJF, B000002MH...
A099280716ZEH5UPWAN4A    [B0000C23DW, B000005B0U, B009NOVCSS, B00000136...
A1004AX2J2HXGL           [B00008J4P5, B000005JAC, B000007MVM, B000077VQ...
A1006V961PBMKA           [B000003TAW, B000065BW8, B00006V9A0, B00008J4P...
A100G9PJZUCJQW           [B00000ICO0, B000003494, B001E2PTJK, B000075AJ...
A100JCBNALJFAW           [B0001NBMBC, B00000099X, B0007WF1X2, B000BEZPQ...
A100T

In [11]:
user_item_dict = seq_data.to_dict()
user_mapping = []
item_set = set()
for user_id, item_list in seq_data.iteritems():
    user_mapping.append(user_id)
    for item_id in item_list:
        item_set.add(item_id)
item_mapping = list(item_set)

print(len(user_mapping), len(item_mapping))

24934 24634


In [12]:
def generate_inverse_mapping(data_list):
    inverse_mapping = dict()
    for inner_id, true_id in enumerate(data_list):
        inverse_mapping[true_id] = inner_id
    return inverse_mapping

def convert_to_inner_index(user_records, user_mapping, item_mapping):
    inner_user_records = []
    user_inverse_mapping = generate_inverse_mapping(user_mapping)
    item_inverse_mapping = generate_inverse_mapping(item_mapping)

    for user_id in range(len(user_mapping)):
        real_user_id = user_mapping[user_id]
        item_list = list(user_records[real_user_id])
        for index, real_item_id in enumerate(item_list):
            item_list[index] = item_inverse_mapping[real_item_id]
        inner_user_records.append(item_list)

    return inner_user_records, user_inverse_mapping, item_inverse_mapping

inner_data_records, user_inverse_mapping, item_inverse_mapping = convert_to_inner_index(user_item_dict, user_mapping, item_mapping)
print(inner_data_records[:5])

[[15915, 2807, 9857, 9850, 19895, 9988, 23902], [5450, 2440, 14127, 8223, 22719, 21934, 15795, 23756, 3482, 9481, 3170, 21779, 2587, 15610, 18908, 16178, 361, 190, 5849, 767, 4487, 4178, 3480, 2064, 2092, 5659, 9573, 11301, 20098], [5547, 5761, 17353, 6837, 10934, 20543, 7669, 2092, 18239], [19975, 13670, 3652, 8475, 1490], [3378, 20231, 526, 19975, 22281, 1820, 574, 16703, 7023, 14325, 24508, 19701, 8790]]


In [13]:
from scipy.sparse import csr_matrix

def generate_rating_matrix(train_set, num_users, num_items):
    # three lists are used to construct sparse matrix
    row = []
    col = []
    data = []
    for user_id, article_list in enumerate(train_set):
        for article in article_list:
            row.append(user_id)
            col.append(article)
            data.append(1)

    row = np.array(row)
    col = np.array(col)
    data = np.array(data)
    rating_matrix = csr_matrix((data, (row, col)), shape=(num_users, num_items))

    return rating_matrix

rating_matrix = generate_rating_matrix(inner_data_records, len(user_mapping), len(item_mapping))
rating_matrix = rating_matrix.transpose()

from sklearn.metrics.pairwise import cosine_similarity

relation_matrix = cosine_similarity(rating_matrix)

In [14]:
rating_matrix.nnz / float(len(user_mapping) * len(item_mapping))

0.0007782956683325232

In [15]:
np.fill_diagonal(relation_matrix, 0)
threshold = 0.3
max_count = 0
for i in range(len(item_mapping)):
    max_count = max(np.count_nonzero((relation_matrix[i] >= threshold) == True), max_count)
    
print(max_count)

count = 0
for i in range(len(item_mapping)):
    if np.count_nonzero((relation_matrix[i] >= threshold) == True) > 0:
        count += 1

print(count)
print(np.max(relation_matrix))
print(relation_matrix[0])
print(relation_matrix[-1])

75
11751
0.948683298051
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]


In [16]:
relation_matrix[relation_matrix < threshold] = 0
relation_matrix[relation_matrix > 0] = 1
relation_matrix = csr_matrix(relation_matrix)
print(len(user_mapping), len(item_mapping))

24934 24634


In [17]:
# process review content
import re
import nltk

# generate the whole document
all_review = []
for item_id in item_mapping:
    all_review.append([review_dict[item_id]])

# use nltk to remove stopwords, and stemming each word
from nltk.corpus import stopwords
stopwords_set = set(stopwords.words('english'))
porter_stemmer = nltk.PorterStemmer()

review_str = []
for i, movie in enumerate(all_review):
    # Use regular expressions to do a find-and-replace
    letters_only = re.sub("[^a-zA-Z]",  # The pattern to search for
                          " ",  # The pattern to replace it with
                          movie[0])  # The text to search
    # print letters_only

    letters_only = letters_only.lower()
    tokens = nltk.word_tokenize(letters_only)

    tokens = [w for w in tokens if w.lower() not in stopwords_set]
    # print tokens

    porter = [porter_stemmer.stem(t) for t in tokens]
    # print porter
    all_review[i] = porter
    review_str.append(' '.join(porter))

print(review_str[-1])

studi chart past month think predict would ciara f missi elliot step cent disco inferno snoop dogg f pharrel drop like hot jennif lopez get right nelli f tim mcgraw game f cent lenni kravitz ladi ludacri get back destini child soldier lose breath u vertigo killer somebodi told mr brightsid gwen stefani rich girl wait john mayer daughter eminem like toy soldier lose twista f faith evan hope modest mous float usher boo caught good charlott wan na live ashanti u ja rule f r kelli ashanti wonder lloyd bank f avant karma jadakiss f mariah carey u make wan na lindsay lohan


In [18]:
# convert to bag-of-words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer="word", tokenizer=None, preprocessor=None, stop_words=None, min_df=3)
word_counts = vectorizer.fit_transform(review_str)
vocab = vectorizer.get_feature_names()

print(len(vocab))
print(word_counts.data.max())
print(word_counts.data.min())
print(len(item_mapping))

24341
139
1
24634


In [19]:
rating_matrix.nnz / float(len(user_mapping) * len(item_mapping))

0.0007782956683325232

In [20]:
# store bag-of-words to file
def vocabulary_to_file(vocab):
    f0 = open('vocabulary.txt', 'w')

    for word in vocab:
        f0.write(word + '\n')
    f0.close()


def word_count_to_file(item_list, word_count):
    f0 = open('word_counts.txt', 'w')
    for i, document in enumerate(word_count):
        indices = document.indices
        counts = document.data
        num_words = document.count_nonzero()

        f0.write(str(item_list[i]) + ' ' + str(num_words))
        for j, indice in enumerate(indices):
            f0.write(' ' + str(indice) + ':' + str(counts[j]))
        f0.write('\n')
    f0.close()

vocabulary_to_file(vocab)
word_count_to_file(item_mapping, word_counts)

In [21]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)


In [22]:
save_obj(user_item_dict, 'Electronics_user_records')
save_obj(user_mapping, 'Electronics_user_mapping')
save_obj(item_mapping, 'Electronics_item_mapping')
save_obj(relation_matrix, 'item_relation')

In [23]:
print(vocab[:10])
print(all_review[-1])
print(review_str[-1])

[u'aa', u'aaa', u'aaaah', u'aacut', u'aad', u'aah', u'aaliyah', u'aaron', u'ab', u'abacab']
[u'studi', u'chart', u'past', u'month', u'think', u'predict', u'would', u'ciara', u'f', u'missi', u'elliot', u'step', u'cent', u'disco', u'inferno', u'snoop', u'dogg', u'f', u'pharrel', u'drop', u'like', u'hot', u'jennif', u'lopez', u'get', u'right', u'nelli', u'f', u'tim', u'mcgraw', u'game', u'f', u'cent', u'lenni', u'kravitz', u'ladi', u'ludacri', u'get', u'back', u'destini', u'child', u'soldier', u'lose', u'breath', u'u', u'vertigo', u'killer', u'somebodi', u'told', u'mr', u'brightsid', u'gwen', u'stefani', u'rich', u'girl', u'wait', u'john', u'mayer', u'daughter', u'eminem', u'like', u'toy', u'soldier', u'lose', u'twista', u'f', u'faith', u'evan', u'hope', u'modest', u'mous', u'float', u'usher', u'boo', u'caught', u'good', u'charlott', u'wan', u'na', u'live', u'ashanti', u'u', u'ja', u'rule', u'f', u'r', u'kelli', u'ashanti', u'wonder', u'lloyd', u'bank', u'f', u'avant', u'karma', u'jadakis

In [24]:
word_to_index = dict()
for w_id, word in enumerate(vocab):
    word_to_index[word] = w_id

all_review_index = []
for i in range(len(review_str)):
    cur_review = review_str[i].split(' ')
    cur_index = []
    for word in cur_review:
        if word in word_to_index:
            cur_index.append(word_to_index[word])
    all_review_index.append(cur_index)
    
print(all_review_index[-1])

[20664, 3552, 15706, 14019, 21551, 16577, 24030, 3779, 13844, 6608, 20430, 3431, 5737, 10594, 19770, 5934, 16009, 6191, 12315, 10073, 11127, 12529, 8568, 17916, 14527, 21710, 13308, 8391, 3431, 12164, 11763, 11866, 12666, 8568, 1420, 5476, 3650, 19847, 12547, 2625, 23122, 11574, 19880, 21814, 14191, 2679, 9228, 20414, 17869, 8666, 23409, 11220, 13245, 5108, 6675, 12315, 21941, 19847, 12547, 22331, 7332, 7043, 10030, 13923, 14166, 7806, 22921, 2396, 3357, 8831, 3545, 23448, 14361, 12416, 1125, 10968, 18269, 11486, 1125, 23954, 12433, 1558, 1333, 11420, 10988, 13046, 3206, 12888, 23448, 14361, 12355, 12475]


In [25]:
# store word sequence to a file
save_obj(all_review_index, 'review_word_sequence')

In [26]:
print(seq_data[-1])
user_inverse_mapping = generate_inverse_mapping(user_mapping)
item_inverse_mapping = generate_inverse_mapping(item_mapping)
print(user_item_dict[user_mapping[-1]])
tmp = []
for item_id in seq_data[-1]:
    tmp.append(item_inverse_mapping[item_id])
print(sorted(tmp))

['B000FBFTD2', 'B000005S5N', 'B000XPU5NY', 'B000007O09', 'B002A9OJ0Q', 'B001PNVYMO', 'B000002L7Y', 'B000Y35A94', 'B006CS7UXO', 'B003U0I9U0', 'B00004SYWZ', 'B0044WWLMU', 'B000001EMN', 'B000024T5L']
['B000FBFTD2', 'B000005S5N', 'B000XPU5NY', 'B000007O09', 'B002A9OJ0Q', 'B001PNVYMO', 'B000002L7Y', 'B000Y35A94', 'B006CS7UXO', 'B003U0I9U0', 'B00004SYWZ', 'B0044WWLMU', 'B000001EMN', 'B000024T5L']
[91, 1957, 3300, 3737, 7260, 11032, 11439, 13192, 15231, 17233, 20633, 22262, 22414, 23704]


In [27]:
print(all_review[-1])

[u'studi', u'chart', u'past', u'month', u'think', u'predict', u'would', u'ciara', u'f', u'missi', u'elliot', u'step', u'cent', u'disco', u'inferno', u'snoop', u'dogg', u'f', u'pharrel', u'drop', u'like', u'hot', u'jennif', u'lopez', u'get', u'right', u'nelli', u'f', u'tim', u'mcgraw', u'game', u'f', u'cent', u'lenni', u'kravitz', u'ladi', u'ludacri', u'get', u'back', u'destini', u'child', u'soldier', u'lose', u'breath', u'u', u'vertigo', u'killer', u'somebodi', u'told', u'mr', u'brightsid', u'gwen', u'stefani', u'rich', u'girl', u'wait', u'john', u'mayer', u'daughter', u'eminem', u'like', u'toy', u'soldier', u'lose', u'twista', u'f', u'faith', u'evan', u'hope', u'modest', u'mous', u'float', u'usher', u'boo', u'caught', u'good', u'charlott', u'wan', u'na', u'live', u'ashanti', u'u', u'ja', u'rule', u'f', u'r', u'kelli', u'ashanti', u'wonder', u'lloyd', u'bank', u'f', u'avant', u'karma', u'jadakiss', u'f', u'mariah', u'carey', u'u', u'make', u'wan', u'na', u'lindsay', u'lohan']


In [28]:
for i in range(word_counts.shape[0]):
    if word_counts.getrow(i).getnnz() == 0:
        print(i)