In [15]:
import pandas as pd
import numpy as np

dir_path = './Amazon_Review/CDs/'
rating_file = 'ratings_CDs_and_Vinyl.csv'

In [16]:
def read_user_rating_records():
    col_names = ['user_id', 'item_id', 'rating', 'timestamp']
    data_records = pd.read_csv(dir_path + rating_file, sep=',', names=col_names, engine='python')
    return data_records

data_records = read_user_rating_records()
data_records.head()
data_records.iloc[[1, 10, 20]]

Unnamed: 0,user_id,item_id,rating,timestamp
1,AHG1GTQZUYNJN,1393774,5.0,1372723200
10,A1WX42M589VAMQ,1393774,5.0,1167350400
20,A2UA9KKUQCTEIN,1501348,5.0,1381017600


In [17]:
print(len(data_records['user_id'].value_counts()), len(data_records['item_id'].value_counts()))

1578597 486360


In [18]:
data_records.loc[data_records.rating < 4, 'rating'] = 0
data_records.loc[data_records.rating >= 4, 'rating'] = 1
data_records = data_records[data_records.rating > 0]
print(len(data_records['user_id'].unique()), len(data_records['item_id'].unique()))

1425149 457623


In [26]:
from copy import deepcopy
def remove_infrequent_items(data, min_counts=5):
    df = deepcopy(data)
    counts = df['item_id'].value_counts()
    df = df[df["item_id"].isin(counts[counts >= min_counts].index)]

    print("items with < {} interactoins are removed".format(min_counts))
    # print(df.describe())
    return df

def remove_infrequent_users(data, min_counts=10):
    df = deepcopy(data)
    counts = df['user_id'].value_counts()
    df = df[df["user_id"].isin(counts[counts >= min_counts].index)]

    print("users with < {} interactoins are removed".format(min_counts))
    # print(df.describe())
    return df
filtered_data = remove_infrequent_users(data_records, 5)
filtered_data = remove_infrequent_items(filtered_data, 8)
print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())))

users with < 5 interactoins are removed
items with < 8 interactoins are removed
num of users:88016, num of items:35287


In [27]:
print(filtered_data.groupby('item_id').size().sort_values(ascending=True)[:10])
print(filtered_data.groupby('user_id').size().sort_values(ascending=True)[:10])

item_id
B00004TAH2    8
B0009F66PY    8
B0009EZ0Q6    8
B004ZLBTNW    8
B00001T3H5    8
B004ZLBU6I    8
B0009ESYAK    8
B001RXB4PO    8
B00001U035    8
B000003Z9K    8
dtype: int64
user_id
ADIHIVQWXQOZ5     1
A2SXWBOWFJTG6Y    1
A62B55P1JZD5O     1
A2SXGE7QAO5T7M    1
A1V29PLRGPVLH2    1
A1V2CR670P3D12    1
ALUL6JHG0DUH      1
A1V2R3MO1ZVJE2    1
A1V2SWRG6B2TDE    1
AUZAX88EP2RD5     1
dtype: int64


In [28]:
# read item's reviews
item_list = filtered_data['item_id'].unique()
item_set = set(item_list)

print(item_list[:10])

review_file = 'reviews_CDs_and_Vinyl_5.json.gz'

import json
import gzip

def parse(path):
    g = gzip.open(path, 'r')
    for l in g:
        yield json.loads(l)
        # yield json.dumps(eval(l))

review_dict = dict()  # [review_id] = review_text
review_helpful = dict()
for l in parse(dir_path + review_file):
    if l['asin'] in item_set:
        if l['asin'] in review_dict:
            if l['helpful'][0] / float(l['helpful'][1] + 0.01) > review_helpful[l['asin']] and len(l['reviewText']) > 10:
                review_dict[l['asin']] = l['reviewText']
                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)
        else:
            if len(l['reviewText']) > 10:
                review_dict[l['asin']] = l['reviewText']
                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)

# print review_dict['1300966947']
            

['0738900370' '0738900672' '0738919039' '0738920363' '0760135002'
 '0767804341' '0767816641' '076783822X' '0769716903' '0769720994']


In [29]:
# delete items without reviews
item_without_review = []
for item_id in item_list:
    if item_id not in review_dict:
        item_without_review.append(item_id)

print(item_without_review)

for item_id in item_without_review:
    filtered_data = filtered_data[filtered_data['item_id'] != item_id]

item_list = filtered_data['item_id'].unique()
print(len(item_list))

['1591796423', '6303439330', '6305712085', 'B000001V37', 'B000001ZWU', 'B0000024F7', 'B0000024UD', 'B000002C45', 'B000002IIN', 'B000002NUD', 'B000002O44', 'B000002SEL', 'B000003YFA', 'B0000046K4', 'B000005KWU', 'B000005L8T', 'B000005NMW', 'B000005TN9', 'B000008M3P', 'B000008UPI', 'B00000G24T', 'B00000ICH9', 'B00001X59W', 'B000028BCJ', 'B00003CK5M', 'B00003G1KA', 'B00004HYKL', 'B00004SR1J', 'B00004TW91', 'B00004WF6M', 'B00005MMN9', 'B0000648Y6', 'B000078JK0', 'B0000AQJOV', 'B0000BWVA3', 'B0000DBJAS', 'B0001IXTU4', 'B0001L7RZY', 'B0002LMLSQ', 'B0002LO7GU', 'B0002PUH1U', 'B00061NLEK', 'B00065U04A', 'B00065VSKU', 'B0008ENI02', 'B0009VNCG4', 'B000B9EYFM', 'B000BH4Z08', 'B000CAKQ0M', 'B000F2CBZU', 'B000F6YW5S', 'B000I2J7M2', 'B000JJSJBE', 'B000LC4ZNA', 'B000P46S3E', 'B000PTYUR0', 'B000Q35SAI', 'B000R0ML28', 'B000SGHSLU', 'B000VJE1VY', 'B000VKL6UW', 'B000VSYHL4', 'B000WZ8RVC', 'B000X1L852', 'B000XC906K', 'B000XVT7SM', 'B001AIG41K', 'B001BFZ0RG', 'B001BSH10M', 'B001CIOCRQ', 'B001QFNSC0', 'B001

In [30]:
for item_id, review in review_dict.items():
    if len(review) < 5:
        print(item_id)
# print review_dict['B002IUAUI2']

In [35]:
# convert records to sequential data per user
def convert_data(data):
    # for each user, sort by timestamps
    df = deepcopy(data)
    df_ordered = df.sort_values(['timestamp'], ascending=True)
    data = df_ordered.groupby('user_id')['item_id'].apply(list)
    #print(data)
    #time_l = df_ordered.groupby('user')['checkin_time'].apply(list)
    #print(time_l)
    print("succressfully created sequencial data! head:", data.head(5))
    unique_data = df_ordered.groupby('user_id')['item_id'].nunique()
    data = data[unique_data[unique_data >= 10].index]
    print(data[:10])
    print(len(data))
    return data

seq_data = convert_data(filtered_data)
print(type(seq_data))

succressfully created sequencial data! head: user_id
A010397922UKJ9QFDYFIE    [B000VH1AZQ, B000CC1TGA, B00BEJ5O6Q, B009A87WM...
A01981332V9QR1ZDG2WLS                 [B0043C3DZU, B00007E8J1, B00004WIR6]
A0200189274BR0E83NMS9                             [157252409X, B000CBVMLS]
A02039013W06XH9FVVFUZ                 [B000008UGB, B00004TRUP, B000002H25]
A02852361P0OLWYC4FW7X     [B000008FVT, B000007NZV, B000053GU0, B00004TL26]
Name: item_id, dtype: object
user_id
A08161909WK3HU7UYTMW     [B00005OMGE, B0041WLBEC, B00005YWFF, B000069JJ...
A099280716ZEH5UPWAN4A    [B000T2PRJI, B004GZSWO6, B0000C23DW, B0000025O...
A1006V961PBMKA           [B000065BW8, B000003TAW, B00006V9A0, B00008J4P...
A100JCBNALJFAW           [B0001NBMBC, B00000099X, B0007WF1X2, B0007Y4TV...
A100TF7VLG8RBV           [B000001FXX, B00002MYYI, B000007OG6, B00001QGP...
A10127132IE1A73IN1HGO    [B004T4YPI6, B0052V0NQ8, B005EIHMU4, B009LXEBD...
A1012N48J0Z65N           [B000002KFJ, B00003TFVK, B000002U8G, B0000033P...
A101L4HF0I

In [32]:
user_item_dict = seq_data.to_dict()
user_mapping = []
item_set = set()
for user_id, item_list in seq_data.iteritems():
    user_mapping.append(user_id)
    for item_id in item_list:
        item_set.add(item_id)
item_mapping = list(item_set)

print(len(user_mapping), len(item_mapping))

17052 35118


In [36]:
def generate_inverse_mapping(data_list):
    inverse_mapping = dict()
    for inner_id, true_id in enumerate(data_list):
        inverse_mapping[true_id] = inner_id
    return inverse_mapping

def convert_to_inner_index(user_records, user_mapping, item_mapping):
    inner_user_records = []
    user_inverse_mapping = generate_inverse_mapping(user_mapping)
    item_inverse_mapping = generate_inverse_mapping(item_mapping)

    for user_id in range(len(user_mapping)):
        real_user_id = user_mapping[user_id]
        item_list = list(user_records[real_user_id])
        for index, real_item_id in enumerate(item_list):
            item_list[index] = item_inverse_mapping[real_item_id]
        inner_user_records.append(item_list)

    return inner_user_records, user_inverse_mapping, item_inverse_mapping

inner_data_records, user_inverse_mapping, item_inverse_mapping = convert_to_inner_index(user_item_dict, user_mapping, item_mapping)
print(inner_data_records[:5])

[[19424, 1824, 4592, 3047, 16232, 28137, 30957, 15135, 5251, 9053, 31521, 28819, 466, 3289, 9602, 14408, 18384, 6926, 16823, 26584, 19449, 4850, 27287, 5117, 4694, 8868, 10830, 17932, 15224, 18093, 34884, 26394, 15488], [16607, 100, 23666, 9859, 6568, 31416, 15922, 31905, 21970, 4694, 16572, 3911], [8356, 25134, 3234, 28418, 25791, 16496, 20149, 12591, 17475, 19735, 12121, 32492, 21821, 6903], [18275, 9043, 6102, 9672, 27869, 1142, 10394, 34153, 7212, 24662, 14752, 14589], [30596, 4267, 28556, 17301, 21730, 10942, 29486, 25925, 25134, 13980, 13965, 14331, 27004, 2408, 4529]]


In [37]:
import pickle
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

In [38]:
save_obj(inner_data_records, 'CDs_item_sequences')
save_obj(user_mapping, 'CDs_user_mapping')
save_obj(item_mapping, 'CDs_item_mapping')