In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [10]:
# Read the review CSV into a DataFrame. The path is relative, so it is resolved
# against the current working directory of the kernel.
# NOTE(review): the file name suggests this is a validation split that already
# carries a BERT-derived score column -- confirm against the data-preparation step.
reader_df = pd.read_csv('../dataset/validate_data_with_bert.csv')

In [11]:
# Display the freshly loaded review table to inspect its columns and row count.
reader_df

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,description,authors,publisher,publishedDate,categories,ratingsCount,bert_score
0,0826414346,dr. seuss: american icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography'],,5
1,0826414346,dr. seuss: american icon,,A14OJS0VWMOSWO,Midwest Book Review,3/4,5.0,1100131200,A memorably excellent survey of Dr. Seuss' man...,Theodor Seuss Giesel was best known as 'Dr. Se...,Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography'],,5
2,0963923080,rising sons and daughters: life among japan's ...,,A3NIQK6ZLYEP1L,Michael Valdivielso,0/0,4.0,1239667200,Almost a day by day view,Steven Wardell went to Japan without reading a...,Wardell recalls his experience as a foreign st...,['Steven Wardell'],Plympton PressIntl,1995,['Social Science'],,4
3,0854968350,muslim women's choices: religious belief and s...,,ATDE9JYCPI0L1,Alyssa A. Lappen,0/0,2.0,1109808000,Oh dear,I was excited to find a book ostensibly about ...,Counters the Western views and stereotypes of ...,"['Camillia Fawzi El-Solh', 'Judy Mabro']",Berg Pub Limited,1994-02-17,['Religion'],,1
4,157067051X,voices from the farm: adventures in community ...,,A1ER5AYS3FQ9O3,"K. Corn ""reviewer""",6/7,5.0,1160870400,Intense memoir told in brief segments by forme...,This book gave me a glimpse at life on The Far...,"Twenty-five years ago, at the height of the co...",['Rupert Fike'],Book Publishing Company,2012-08-21,['Biography & Autobiography'],1.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86131,0425200736,echoes (berkley sensation),,AFVQZQ8PW0L,Harriet Klausner,4/5,5.0,1099180800,Engaging romantic suspense,"Mountain Bend, California elementary school pr...","Grace Peterson is desperate, in hiding, and on...",['Maya Banks'],Penguin,2012-07-03,['Fiction'],13.0,4
86132,0786182431,very bad deaths: library edition,90.0,A2CR57GAJKNWVV,"booksforabuck ""BooksForABuck""",6/8,4.0,1106956800,Thoughtful and Powerful,Russell Walker wants to be left alone to die--...,This seven volume set reissues a collection of...,['Various Authors'],Routledge,2021-05-13,['Social Science'],,5
86133,0786182431,very bad deaths: library edition,90.0,A1I2O9Y3X3HXLS,Arthur W. Jordin,1/1,5.0,1230595200,The Perils of Telepathy,Very Bad Deaths (2004) is the first SF novel i...,This seven volume set reissues a collection of...,['Various Authors'],Routledge,2021-05-13,['Social Science'],,4
86134,0786182431,very bad deaths: library edition,90.0,AFVQZQ8PW0L,Harriet Klausner,5/7,5.0,1103846400,fantastic paranormal serial killer tale,"Fiftyish Canadian Russell Walker writes ""The F...",This seven volume set reissues a collection of...,['Various Authors'],Routledge,2021-05-13,['Social Science'],,5


In [12]:
# Clean the raw review table in three steps:
#   1. reduce publishedDate to an integer year,
#   2. one-hot encode the book category,
#   3. turn the 'helpful/total' vote string into a ratio in [0, 1].

# Drop the columns that are not used to build the reader features.
reader_df = reader_df.drop(columns=['Price', 'profileName', 'review/summary', 'review/text',
                                    'description', 'authors', 'publisher'])

# get year from publishedDate ('YYYY-MM-DD' or 'YYYY' -> 'YYYY')
reader_df['publishedDate'] = reader_df['publishedDate'].str.split('-').str[0]
# fill the missing value with the most frequent year
reader_df['publishedDate'] = reader_df['publishedDate'].fillna(reader_df['publishedDate'].value_counts().index[0])
# handle the year with ?, replace with the most frequent year of that century (19xx).
# Only fully known years are eligible as the replacement (otherwise a malformed
# value such as '19??' could be picked as its own substitute), and the lookup is
# skipped entirely when no year contains a '?'.
unknown_year = reader_df['publishedDate'].str.contains('?', regex=False)
if unknown_year.any():
    known_19xx_years = reader_df.loc[
        ~unknown_year & reader_df['publishedDate'].str.startswith('19'), 'publishedDate']
    reader_df.loc[unknown_year, 'publishedDate'] = known_19xx_years.value_counts().index[0]
# remove the * sign in the publishedDate; '*' is a regex metacharacter, so the
# replacement is forced to be literal instead of relying on the pandas default.
reader_df['publishedDate'] = reader_df['publishedDate'].str.replace('*', '', regex=False)
reader_df['publishedDate'] = reader_df['publishedDate'].astype(int)

# one-hot encoding for book category
# Strip the list punctuation ("['Fiction']" -> "Fiction"); literal replacement for
# the same reason as above ('[' and ']' are regex metacharacters).
reader_df['categories'] = (
    reader_df['categories']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.replace("'", '', regex=False)
)
# build category list; missing categories are excluded because get_dummies creates
# no indicator column for NaN, so every entry here has a matching 'categories_<name>' column.
category_list = reader_df['categories'].dropna().unique().tolist()
reader_df = pd.get_dummies(reader_df, columns=['categories'])

# helpfulness ratio: 'helpful/total' -> helpful / total
vote_parts = reader_df['review/helpfulness'].str.split('/')
helpful_votes = vote_parts.str[0].astype(float)
total_votes = vote_parts.str[1].astype(float)
# Reviews without any votes (total == 0, or a missing value) get a ratio of 0
# rather than NaN/inf from the division.
reader_df['review/helpfulness'] = (helpful_votes / total_votes).where(total_votes > 0, 0.0)
reader_df['review/helpfulness'] = reader_df['review/helpfulness'].fillna(0)

In [13]:
# For every reader, record two parallel lists ordered from the highest to the
# lowest review score: the ids of the reviewed books and the scores themselves.
reader_id_list = reader_df['User_id'].unique().tolist()
train_reader_buying_list = {}
for reader in reader_id_list:
    # Rank this reader's reviews once and read both columns from the same ordering.
    ranked_reviews = reader_df.loc[reader_df['User_id'] == reader].sort_values(
        by='review/score', ascending=False)
    train_reader_buying_list[reader] = [
        ranked_reviews['Id'].tolist(),
        ranked_reviews['review/score'].tolist(),
    ]

In [14]:
# Aggregate the per-review table into one feature row per reader, then min-max
# normalise every feature to [0, 1].
#
# NOTE: this cell replaces `reader_df` (per-review rows) with the per-reader
# feature table, so it cannot be re-run on its own -- re-run the loading and
# preprocessing cells first.


def _min_max_scale(column):
    """Rescale a numeric Series to the [0, 1] range.

    A column whose values are all equal has max - min == 0; dividing by that
    would fill the whole feature with NaN, so such a column is mapped to 0.0.
    """
    lowest = column.min()
    spread = column.max() - lowest
    if spread == 0:
        return pd.Series(0.0, index=column.index)
    return (column - lowest) / spread


reader_id_list = reader_df['User_id'].unique().tolist()
# reader -> [titles sorted by score (descending), the matching scores]
reader_buying_list = {}
attribute_dict = {
    'User_id': [],
    'avg_helpfulness': [],
    # despite the names, these two are the earliest / latest *publication* year
    # among the books the reader reviewed (taken from publishedDate)
    'start_buying_year': [],
    'end_buying_year': [],
}
for category in category_list:
    attribute_dict["ratio_" + category] = []
    attribute_dict["score_" + category] = []

for reader in reader_id_list:
    reader_reviews = reader_df[reader_df['User_id'] == reader]
    attribute_dict['User_id'].append(reader)
    attribute_dict['avg_helpfulness'].append(reader_reviews['review/helpfulness'].mean())
    attribute_dict['start_buying_year'].append(reader_reviews['publishedDate'].min())
    attribute_dict['end_buying_year'].append(reader_reviews['publishedDate'].max())

    # Sort once and take both columns from the same ordering.
    ranked_reviews = reader_reviews.sort_values(by=['review/score'], ascending=False)
    reader_buying_list[reader] = [
        ranked_reviews['Title'].tolist(),
        ranked_reviews['review/score'].tolist(),
    ]

    for category in category_list:
        in_category = reader_reviews['categories_' + category] == 1
        # ratio of buying books in each category
        attribute_dict["ratio_" + category].append(in_category.sum() / reader_reviews.shape[0])
        # average review score of buying books in each category
        # (0 when the reader has no review in that category)
        score_mean = reader_reviews.loc[in_category, 'review/score'].mean()
        attribute_dict["score_" + category].append(0 if np.isnan(score_mean) else score_mean)

reader_df = pd.DataFrame(attribute_dict)
# normalize the data: every column except the reader id is a numeric feature
for feature in reader_df.columns:
    if feature != 'User_id':
        reader_df[feature] = _min_max_scale(reader_df[feature])

In [15]:
# Display the per-reader feature table (one row per User_id, features scaled to [0, 1]).
reader_df

Unnamed: 0,User_id,avg_helpfulness,start_buying_year,end_buying_year,ratio_Biography & Autobiography,score_Biography & Autobiography,ratio_Social Science,score_Social Science,ratio_Religion,score_Religion,...,ratio_Fiction,score_Fiction,ratio_Business & Economics,score_Business & Economics,ratio_Computers,score_Computers,ratio_Family & Relationships,score_Family & Relationships,ratio_Cooking,score_Cooking
0,A30TK6U7DNS82R,0.716382,0.958969,0.857143,0.484559,0.847273,0.023091,0.666667,0.010930,0.800000,...,0.527094,0.854206,0.005445,1.000000,0.010310,1.0,0.000000,0.000000,0.000000,0.000000
1,A14OJS0VWMOSWO,0.700399,0.937401,0.857143,0.196595,1.000000,0.075100,1.000000,0.106148,1.000000,...,0.226079,0.999606,0.108214,0.999091,0.057751,1.0,0.318962,1.000000,0.100491,1.000000
2,A3NIQK6ZLYEP1L,0.618646,0.937401,0.857143,0.083965,0.980000,0.051350,0.942857,0.005208,1.000000,...,0.690141,0.888435,0.005189,0.600000,0.000000,0.0,0.000000,0.000000,0.020518,1.000000
3,ATDE9JYCPI0L1,0.815510,0.984745,0.714286,0.160161,0.933333,0.093284,0.750000,0.215252,0.769231,...,0.089552,1.000000,0.032993,1.000000,0.000000,0.0,0.103856,1.000000,0.021743,1.000000
4,A1ER5AYS3FQ9O3,0.841288,0.991583,0.857143,0.436210,0.930000,0.057165,0.800000,0.000000,0.000000,...,0.439024,0.894444,0.053915,0.850000,0.000000,0.0,0.594004,0.971429,0.106594,0.933333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,A3TZV5SXQ8F6ST,0.628550,0.993161,0.857143,0.412722,0.900000,0.060096,1.000000,0.021334,1.000000,...,0.384615,0.760000,0.021255,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
814,A1LI7KZH6VHPAQ,0.849318,0.985797,0.714286,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.019231,1.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
815,A76FRFSYRN1TK,0.786917,0.991583,0.857143,0.156883,0.720000,0.000000,0.000000,0.038925,0.900000,...,0.736842,0.880952,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
816,A29E0Q47E8CPPO,0.592595,0.973172,0.714286,0.121252,0.900000,0.000000,0.000000,0.000000,0.000000,...,0.525424,0.793548,0.018733,1.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [16]:
# Display the reader -> [titles, scores] mapping; this prints the whole dictionary.
reader_buying_list

{'A30TK6U7DNS82R': [['dr. seuss: american icon',
   'not without love: memoirs',
   'the l shaped room',
   'tilt-a-whirl',
   'dear people: remembering jonestown',
   'the hollow (winterbrook edition)',
   'secret anniversaries of the heart: new and selected stories by lev raphael',
   'finnegans wake',
   "a summer of faulkner: as i lay dying/the sound and the fury/light in august (oprah's book club)",
   'spark notes our town',
   'ultimate gay erotica 2005',
   'fiddlers',
   'you can say you knew me when',
   'swing: a mystery',
   'wodehouse: a life',
   'ariel',
   'a day, a night, another day, summer: stories',
   'the oracle: the lost secrets and hidden messages of ancient delphi',
   'doctor zhivago',
   'my life',
   'fiddlers (87th precinct mysteries)',
   'funny accent: a novel',
   'sgt. york: his life, legend & legacy: the remarkable untold story of sgt. alvin c. york',
   'murder in mesopotamia: a hercule poirot mystery',
   'a star over bethlehem and other stories',
  

In [17]:
# Serialize reader_buying_list (reader id -> [titles, scores]) to a pickle file.
# The file is opened in binary write mode, so an existing file is overwritten.
with open('../dataset/reader_buying_list.pkl', 'wb') as f:
    pickle.dump(reader_buying_list, f)

In [8]:
# Optional export of the per-reader feature table to CSV (intentionally disabled):
# reader_df.to_csv('../dataset/reader_info.csv', index=False)