# MMSBR preprocess step 1

In [1]:
import gzip
from collections import defaultdict
from datetime import datetime
import os
import time
import pickle
import pandas as pd
import json as json
import numpy as np

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

meta_dataset = 'Grocery_and_Gourmet_Food'
# Grocery_and_Gourmet_Food/Electronics/Home_and_Kitchen
# datasets_name = meta_dataset + '_5'

price_level_num = 99
time_interval = 60*60*24
data_path = 'originalData/meta_' + meta_dataset + '.json.gz'
df_item = getDF(data_path)


# item attributes, itemID-price-category-title-imageURL-imageURLHigh
item_property = df_item[['asin', 'price', 'category', 'title', 'imageURL', 'imageURLHighRes']]

item_property = item_property.dropna(axis=0)

# filtering out items with invalid text or images
if meta_dataset == 'Cell_Phones_and_Accessories':
    item_property = item_property[item_property['asin'] != 'B0187XZOUC']
    item_property = item_property[item_property['asin'] != 'B00M4G42W0']
elif meta_dataset == 'Grocery_and_Gourmet_Food':
    item_property = item_property[~item_property['asin'].isin(stopItem)] 
elif meta_dataset == 'Sports_and_Outdoors':
    stopItem = ['B00R58SRYQ', 'B00E9A52YQ', 'B00HPT7IF4', 'B004EPXXQU', 'B004ELBRK8', 'B00DVOLLKA', 'B011RTEN8C', 'B00PBEHHAG', 'B009TNKXRO', 'B00CP6AML4', 'B00E1EU0Q0', 'B00RHA8O9K', 'B00AG270IM', 'B00P8C789Q', 'B00CN9OXWC', 'B004JYNAAA', 'B00NX46FX6', 'B00NX46XU6', 'B003BLOTX4', 'B00LFITP9S', 'B00DDTI29Q', 'B008B9D2ZC', 'B00MMTRHGW', 'B00MMTXAEK', 'B00DVNBTWG', 'B00914S9GI', 'B001IHERIA', 'B00RH9XOCI', 'B00L1R8Y72', 'B00BC1ZVP0', 'B01HEGK1S8', 'B00R7AXISW', 'B003BLOUCO', 'B0049MHMEC', 'B004J2S3I6', 'B00R59X3QM', 'B00MWF4B9M', 'B00BPIDGFM', 'B005W3C9C2', 'B00HE4PS2U', 'B00KNE5VGQ', 'B00L4IGV3M', 'B00L4IGRR2', 'B00TB0BL94', 'B00TB0MWTM', 'B01HEGJX4Q', 'B01HEGK5U2', 'B008I7Q81W', 'B0108NM3SA', 'B001154PLI', 'B00LFJFUMS', 'B01AANWPWM', 'B00P8DQZHQ', 'B00DIIS346', 'B00FSAJFUI', 'B00DVOKBW4', 'B00R58UIW0', 'B00EQ6OSOI', 'B0147JS4DO', 'B0147JSTUC', 'B0155UIGSM', 'B00QH6LH4O', 'B019Z7T8QK', 'B019Z7THLG', 'B0054PD0SU', 'B00KZHR24A', 'B00G7LEEXK', 'B00QVZJUMM', 'B00QVB3EIC', 'B01FXOF0CU', 'B001MS5NNI', 'B001MS9EYM', 'B018RK85F6', 'B00K5T5FN8', 'B00F1JVW0W', 'B01HEGJX3C', 'B00AYS9LGC', 'B00ONACPR4', 'B00DDTZDXY', 'B002PN2RA6', 'B01HEGK5TI', 'B00OVTC4VE', 'B00LFIQQUY', 'B00JRVW7F4', 'B0067VNG6Q', 'B00KIXW42Q', 'B0012QWACW']
    item_property = item_property[~item_property['asin'].isin(stopItem)] 
    

def is_number(s):
    
    try:
        float(s)
        return True
    except ValueError:
        pass
    if '−' in  s:
        return True
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
 
    return False


def reg_price(price):
    if is_number(price):
        return  float(price)
    if '−' in  price:
        price_list = price.replace(' ', '').split('−')
        results = (float(price_list[0]) + float(price_list[1]))/2
        return results
    price_num = price.replace('$','')
    if is_number(price_num):
        results = float(price_num)
    else:
        results = ''
    return results

def reg_category(cate):
#     将倒数第一个作为item类别
    results = ''
    if isinstance(cate,list):
        if len(cate) == 0:
            results = ''
        else:
            results = cate[-1]
    else:
        results = ''
    return results

def get_text(title):
    # text = title 
    results = ''
    if title != '':
        results=[]
        results.append(title)
    return results

def get_image(image):
    results = ''
    if isinstance(image,list):
        if len(image) != 0:
            results = image[0]
    return results

item_property['price_num'] = item_property.price.map(reg_price)
item_property['cate'] = item_property.category.map(reg_category)



item_property['text'] = item_property.apply(lambda row: get_text(row['title']), axis=1)
item_property['image'] = item_property.apply(lambda row: get_image(row['imageURLHighRes']), axis=1)



# delete items without corresponding attributes
item_property = item_property[(item_property['price_num'] != '')]
item_property = item_property[(item_property['cate'] != '' )]
item_property = item_property[item_property['text'] != '']
item_property = item_property[(item_property['image'] != '' )]

item_property[['price_num']] = item_property[['price_num']].astype(float)

item_property.drop_duplicates(subset=['asin'],keep='first',inplace=True)


item_data = item_property[['asin', 'price_num', 'cate', 'image', 'text']]

# 各个类别商品数量
group_cate_num = pd.DataFrame(item_data.groupby(item_data['cate']).count())
group_num=group_cate_num.reset_index()[['cate','asin']].rename(columns={'asin':'count'})
#  各个类别商品最低价
group_cate_min = pd.DataFrame(item_data['price_num'].groupby(item_data['cate']).min())
group_min=group_cate_min.reset_index()[['cate','price_num']].rename(columns={'price_num':'min'})
# 各个类别商品最高价
group_cate_max = pd.DataFrame(item_data['price_num'].groupby(item_data['cate']).max())
group_max=group_cate_max.reset_index()[['cate','price_num']].rename(columns={'price_num':'max'})
# 各个类别商品均价
group_cate_mean = pd.DataFrame(item_data['price_num'].groupby(item_data['cate']).mean())
group_mean=group_cate_mean.reset_index()[['cate','price_num']].rename(columns={'price_num':'mean'})
# 各个类别商品标准差
group_cate_std = pd.DataFrame(item_data['price_num'].groupby(item_data['cate']).std())
group_std=group_cate_std.reset_index()[['cate','price_num']].rename(columns={'price_num':'std'})

item_data1 = pd.merge(item_data, group_num, how='left', on = 'cate')
item_data2 = pd.merge(item_data1, group_min, how='left', on = 'cate')
item_data3 = pd.merge(item_data2, group_max, how='left', on = 'cate')

item_data4 = pd.merge(item_data3, group_mean, how='left', on = 'cate')
item_data = pd.merge(item_data4, group_std, how='left', on = 'cate')

# 删除所在类别包含item数量少于5的item，冷启动情况下，加入这些items
item_data = item_data[item_data['count']>4]

# 删除类别下价格异常商品
item_data = item_data[item_data['std']!=0]

item_data = item_data[['asin', 'price_num', 'cate', 'image','text', 'min', 'max', 'mean', 'std']]

import math
def get_price_level(price, p_min, p_max):
    fenzi = price - p_min
    fenmu = p_max - p_min
    if fenmu == 0 or price==0:
        return -1
    results = int(fenzi / fenmu * price_level_num) + 1
    return results

def logistic(t, u, s):
    gama = s * 3**(0.5) / math.pi
    results = 1 / (1 + math.exp((t-u)/gama))
    return results

item_data['price_level'] = item_data.apply(lambda row: get_price_level(row['price_num'], row['min'], row['max']), axis=1)
item_final = item_data[item_data['price_level'] != -1]
item_final = item_final[['asin', 'price_num', 'cate', 'image','text', 'price_level']]
item_final = item_final.reset_index(drop=True)


In [None]:

# 记录item-imageURL item-text 对应关系, 完整的id_list, 未经过后续过滤，冗余item较多
# dict item_id: image_url
id2image = {}
# dict item_id: text
id2text = {}

for _, row in item_final.iterrows():
    if row['asin'] not in id2image:
        id2image[row['asin']] = row['image']
    if row['asin'] not in id2text:
        id2text[row['asin']] = row['text']

dict_path = './dict/' + meta_dataset
if not os.path.exists(dict_path):
    os.makedirs(dict_path)
dict_ID2text = dict_path + '/id2text.csv'
id2text_key_list = list(id2text.keys())
id2text_val_list = list(id2text.values())
id2text_dict = pd.DataFrame({'asin':id2text_key_list, 'text':id2text_val_list})
id2text_dict.to_csv(dict_ID2text)

# id2image = {'4639725043': 'https://images-na.ssl-images-amazon.com/images/I/31V3NfjNACL.jpg'}
# id2text = {'4639725043': [text]}
dict_path = './dict/' + meta_dataset    
dict_save_path = dict_path + '/item_final.csv'
dict_image_save = dict_path + '/id2imageURL.npy'
dict_text_save = dict_path + '/id2text.npy'
if not os.path.exists(dict_path):
    os.makedirs(dict_path)
item_final.to_csv(dict_save_path)
np.save(dict_image_save, id2image)
np.save(dict_text_save, id2text)
print("dataset: ", meta_dataset, "save the meta and dict")
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

### 执行以下代码，融合用户交互数据
### processing image&text at pycharm
### downloading image

### 用户交互数据与item特征数据进行融合

In [7]:
import gzip
from collections import defaultdict
from datetime import datetime
import os
import time
import pickle
import pandas as pd
import json as json
import numpy as np

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

meta_dataset = 'Clothing_Shoes_and_Jewelry'
datasets_name = meta_dataset
interaction_path = './originalData/' + datasets_name + '_5.json.gz'
# df = getDF('originalData/Tools_and_Home_Improvement.json.gz')
df = getDF(interaction_path)
# 用户商品交互数据, userID-itemID-overall-timestamp
time_interval = 60*60*24

temp_iter = df[['reviewerID', 'asin', 'unixReviewTime']]

def merge_review_time(review, time):
    return str(review)+str(time)

temp_iter['review_time'] = temp_iter.apply(lambda row: merge_review_time(row['reviewerID'], row['unixReviewTime']), axis=1)
temp_iter = temp_iter[['review_time', 'asin', 'unixReviewTime']]
interaction=temp_iter.rename(columns={'review_time':'reviewerID'})
interaction.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,reviewerID,asin,unixReviewTime
0,A180LQZBUWVOLF1433289600,32034,1433289600
1,ATMFGKU5SVEYY1427846400,32034,1427846400
2,A1QE70QBJ8U6ZG1421107200,32034,1421107200
3,A22CP6Z73MZTYU1419292800,32034,1419292800
4,A22L28G8NRNLLN1418601600,32034,1418601600
5,A5N0QU8JBRVQQ1215216000,899332757,1215216000
6,ANGLIGXJHXC6D1486425600,899332757,1486425600
7,ASAE9FBGAJQAH1482796800,899332757,1482796800
8,A1TAW91SQSU2AH1481500800,899332757,1481500800
9,A1SVGO4GMEOXQ01471046400,899332757,1471046400


In [8]:
item_path = './dict/' + meta_dataset + '/item_final.csv'
item_final =  pd.read_csv(item_path)
item_final = item_final[['asin', 'price_level', 'cate']]


user_item1 = pd.merge(interaction, item_final, how='left', on = 'asin')
user_item2 = user_item1.dropna(axis=0)


user_item2.sort_values(by=["reviewerID","unixReviewTime"],inplace=True,ascending=[True,True])
user_click_num = pd.DataFrame(user_item2.groupby(user_item2['reviewerID']).count())
click_num=user_click_num.reset_index()[['reviewerID','asin']].rename(columns={'asin':'click_num'})
item_data6 = pd.merge(user_item2, click_num, how='left', on = 'reviewerID')
# 删除长度为1的session
item_data7 = item_data6[item_data6['click_num']>1]
data = item_data7[['reviewerID', 'asin', 'unixReviewTime', 'price_level', 'cate']]
data.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,reviewerID,asin,unixReviewTime,price_level,cate
1,A0001528BGUBOEVR6T5U1410480000,B002M4110K,1410480000,88.0,Cleaning Tools
2,A0001528BGUBOEVR6T5U1410480000,B004I7D1WA,1410480000,5.0,Trailer Winches
3,A0001528BGUBOEVR6T5U1439337600,B0009PUQ8M,1439337600,28.0,Footprints
4,A0001528BGUBOEVR6T5U1439337600,B0034XCI82,1439337600,1.0,Chairs
5,A0001528BGUBOEVR6T5U1439337600,B004E4AVY8,1439337600,2.0,Family Camping Tents
13,A0029274J35Q1MYNKUWO1402876800,B001OLU4UA,1402876800,5.0,Rifle Cases
14,A0029274J35Q1MYNKUWO1402876800,B007IWZO54,1402876800,16.0,Gun Slings
30,A0080055DD81KOGDOWPS1488844800,B001OEJUU2,1488844800,11.0,Fixed Blade Hunting Knives
31,A0080055DD81KOGDOWPS1488844800,B00P2BB2PE,1488844800,9.0,Multifunction Tools
32,A0080055DD81KOGDOWPS1488844800,B00WPCR0N6,1488844800,2.0,Safety & Survival


In [9]:

data2 = data

user_click_num2 = pd.DataFrame(data2.groupby(data2['reviewerID']).count())
click_num2=user_click_num2.reset_index()[['reviewerID','asin']].rename(columns={'asin':'click_num'})
data3 = pd.merge(data2, click_num2, how='left', on = 'reviewerID')
data4 = data3[data3['click_num']>1]



save_path = './dict/' + meta_dataset +  '/interaction.csv'
data4.to_csv(save_path)
print("dataset: ", meta_dataset)
print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

dataset:  Sports_and_Outdoors
2022-09-28 17:05:50
