In [58]:
import pandas as pd
from datetime import datetime

# Turn off pandas' chained-assignment warning for the whole notebook
# (the pandas default for this option is 'warn').
pd.options.mode.chained_assignment = None

# Folder that holds the scraper's JSON-lines exports. Change this single
# constant to point the notebook at a different export location.
DATA_DIR = 'C:/Users/patrick.walsh/workspace/GoodreadsScraper-1'

# lines=True: each file is read as one JSON object per line.
dfb = pd.read_json(DATA_DIR + '/book_.jl', lines=True)    # books
dfr = pd.read_json(DATA_DIR + '/review_.jl', lines=True)  # reviews
dfa = pd.read_json(DATA_DIR + '/author_.jl', lines=True)  # authors

Book Transformation

In [59]:
# Prepend the site prefix to every review URL so it can be compared with
# the URLs stored in the book dataset.
url_prefix = 'https://www.goodreads.com'
dfr['url_full'] = url_prefix + dfr['url']

# Left join: every book row is kept, with review columns attached where the
# book URL equals the prefixed review URL.
dfbr = dfb.merge(
    dfr,
    how='left',
    left_on='url',
    right_on='url_full',
)

In [60]:
# clean shelf values
# Map the raw shelf identifiers to display labels; values that are not keys
# of the dictionary are left unchanged by replace().
shelf_dict = {"read" : 'Read', "currently-reading" : 'Currently Reading', "to-read": 'To Read'}
dfbr = dfbr.replace({"shelf": shelf_dict})

# count awards
def _award_count(awards):
    """Return the number of entries in a book's awards value, or 0 if missing."""
    # Missing values show up as None or as a float NaN; neither has a length.
    if awards is None or isinstance(awards, float):
        return 0
    return len(awards)


# Assign the whole column in one statement. The previous row-by-row
# `dfbr['award_ct'][i] = ...` is chained indexing, which pandas does not
# guarantee will write back to dfbr.
dfbr['award_ct'] = [_award_count(awards) for awards in dfbr['awards']]

# series boolean
# Label each row by whether its 'series' value is missing. Series.isna()
# tests each cell as a whole, so it also works if a cell holds a list-like
# value (where a scalar pd.isna() call would return an array instead of a
# single True/False).
dfbr['series_bool'] = dfbr['series'].isna().map(
    {True: 'Standalone', False: 'Part of a series'}
)

# controversiality
## column for each of the rating counts
def _star_count(histogram, stars):
    """Return the count stored under key `stars` in a rating histogram.

    Returns 0 when the histogram is not a dict (e.g. a missing value) or
    when the key is absent or empty.
    """
    if not isinstance(histogram, dict):
        return 0
    return histogram.get(stars) or 0


# One column per star level, each assigned in a single statement rather than
# cell-by-cell through chained indexing.
for stars in ('1', '2', '3', '4', '5'):
    dfbr[stars + '_star_ratings'] = [
        _star_count(histogram, stars) for histogram in dfbr['rating_histogram']
    ]

# NOTE: pandas does not raise on a zero denominator; the ratios below come
# out as inf (or NaN for 0/0) for books with no low ratings.

## ratio of 5 star ratings to 1 star ratings
dfbr['5_to_1_ratio'] = dfbr['5_star_ratings'] / dfbr['1_star_ratings']

## ratio of 5 and 4 star ratings to 1 and 2 star ratings
dfbr['4_5_to_1_2_ratio'] = (
    (dfbr['4_star_ratings'] + dfbr['5_star_ratings'])
    / (dfbr['1_star_ratings'] + dfbr['2_star_ratings'])
)

# my rating text to int
## create dictionary
rating_dict = {'did not like it': '1',
                   'it was ok': '2',
                   'liked it': '3',
                   'really liked it': '4',
                   'it was amazing': '5'}

## map values using the dictionary
# Series.map yields NaN for any value that is not a key of rating_dict
# (including missing ratings). to_numeric turns the '1'..'5' strings into
# numbers, and rows without a recognised rating become 0, so the column is
# a true integer column instead of a mix of strings, None and ints.
dfbr['my_rating_int'] = (
    pd.to_numeric(dfbr['my_rating'].map(rating_dict))
    .fillna(0)
    .astype(int)
)

# my rating versus average rating
# Coerce to numbers first so this step works whether 'my_rating_int' holds
# ints or numeric strings; anything unparseable becomes NaN.
my_rating_num = pd.to_numeric(dfbr['my_rating_int'], errors='coerce')

# A rating of 0 (or a missing one) means "not rated".
is_rated = my_rating_num.fillna(0).ne(0)

# Keep one value per row (NaN for unrated rows) so the result always lines
# up with dfbr. Collecting values only for rated rows produces a shorter
# list, and assigning that as a column raises a length-mismatch ValueError.
rating_dif = (my_rating_num - dfbr['avg_rating']).where(is_rated)
dfbr['rating_dif'] = rating_dif

# remove ' 00:00:00' from publish_date, my_review_date_added, my_review_date_read
def _date_only(raw):
    """Parse the leading 'YYYY-MM-DD' of a timestamp value into a date.

    Returns the placeholder 0 when the value is missing (None or a float
    such as NaN), matching the placeholder the export has always used.
    Raises ValueError if the first ten characters are not a valid date.
    """
    if raw is None or isinstance(raw, float):
        return 0
    return datetime.strptime(str(raw)[:10], '%Y-%m-%d').date()


# One shared helper instead of three copies of the same loop.
publish_date_new = [_date_only(raw) for raw in dfbr['publish_date']]
my_review_date_added_new = [_date_only(raw) for raw in dfbr['my_review_date_added']]
my_review_date_read_new = [_date_only(raw) for raw in dfbr['my_review_date_read']]

dfbr['publish_date_new'] = publish_date_new
dfbr['my_review_date_added_new'] = my_review_date_added_new
dfbr['my_review_date_read_new'] = my_review_date_read_new

# TODO: remove special characters from review text (not implemented yet)




Genre Dataset Creation

In [61]:
###first, create list of unique genres
# explode() gives one row per (book/review row, genre) and keeps the original
# row label in the index.
df_gen = dfbr.explode('genres')
genre_set = set(list(df_gen['genres']))

###create a dataframe and generate a new field in it that has a count for each book that mentions that genre
# Drop rows without a genre, then keep each (row, genre) pair once so that a
# genre repeated inside one row's list is still counted a single time.
genre_rows = (
    df_gen['genres']
    .dropna()
    .rename_axis('row')
    .reset_index()
    .drop_duplicates()
)

# Count rows per genre. groupby sorts its keys, so dfg comes out ordered by
# genre name with a clean 0..n-1 index, instead of in set-iteration order
# (which is not stable from run to run).
dfg = (
    genre_rows.groupby('genres')
    .size()
    .rename_axis('genre')
    .reset_index(name='book_count')
)

Author Transformation

In [62]:
# remove timestamp for birth and death date
def _author_date(raw):
    """Return the date parsed from the first ten characters of `raw`.

    Float values (how missing dates appear here) yield the placeholder 0.
    """
    if type(raw) != float:
        date_only = str(raw[:10])
        return datetime.strptime(date_only, '%Y-%m-%d').date()
    return 0


# Build both lists row by row, births first and then deaths.
birth_date_new = [_author_date(row['birth_date']) for _, row in dfa.iterrows()]
death_date_new = [_author_date(row['death_date']) for _, row in dfa.iterrows()]

dfa['birth_date_new'] = birth_date_new
dfa['death_date_new'] = death_date_new

Output

In [63]:
# Columns kept in the book/review export, in output order.
book_review_columns = [
    'url_x', 'title', 'author', 'num_ratings', 'num_reviews', 'avg_rating',
    'num_pages', 'language', 'genres', 'awards',
    'characters', 'series',
    'isbn', 'isbn13', 'places', 'asin', 'my_review',
    'shelf', 'award_ct', 'series_bool',
    '1_star_ratings', '2_star_ratings', '3_star_ratings',
    '4_star_ratings', '5_star_ratings',
    '5_to_1_ratio', '4_5_to_1_2_ratio', 'my_rating_int',
    'publish_date_new', 'my_review_date_added_new', 'my_review_date_read_new',
    'rating_dif', 'img_url',
]
dfbr = dfbr[book_review_columns]

# Columns kept in the author export, in output order.
author_columns = [
    'url', 'name', 'genres', 'avg_rating', 'num_reviews', 'num_ratings',
    'about', 'influences', 'birth_date_new', 'death_date_new',
    'img_url', 'birthplace',
]
dfa = dfa[author_columns]

In [65]:
# Output datasets to csvs
# Each frame is written to the current working directory with default
# to_csv settings (the index is included as the first column).
csv_outputs = (
    (dfbr, 'book_review.csv'),  # books/reviews
    (dfa, 'author.csv'),        # authors
    (dfg, 'genre.csv'),         # genres
)
for frame, csv_name in csv_outputs:
    frame.to_csv(csv_name)
