In [4]:
import pandas as pd
import numpy as np

# Load the raw book metadata from disk and preview its first rows.
books_path = "../data/raw/books.csv"
books = pd.read_csv(filepath_or_buffer=books_path)
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


There are 23 columns, so I'll drop the ones we don't need for our analysis and reorder the remaining ones.


In [5]:
# Columns kept for the analysis, listed in the order they should appear.
columns = [
    "book_id",
    "title",
    "original_title",
    "authors",
    "isbn",
    "average_rating",
    "ratings_count",
    "work_ratings_count",
    "work_text_reviews_count",
]

# Rebind `books` to the reduced frame (every row, only the listed columns).
books = books[columns]
books.head()

Unnamed: 0,book_id,title,original_title,authors,isbn,average_rating,ratings_count,work_ratings_count,work_text_reviews_count
0,2767052,"The Hunger Games (The Hunger Games, #1)",The Hunger Games,Suzanne Collins,439023483,4.34,4780653,4942365,155254
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré",439554934,4.44,4602479,4800065,75867
2,41865,"Twilight (Twilight, #1)",Twilight,Stephenie Meyer,316015849,3.57,3866839,3916824,95009
3,2657,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,61120081,4.25,3198671,3340896,72586
4,4671,The Great Gatsby,The Great Gatsby,F. Scott Fitzgerald,743273567,3.89,2683664,2773745,51992


Everything looks good here, so we can move on to the other files that contain data we need for our model.

In [6]:
# `tags` maps tag_id -> tag_name; `help_table` links books to tag ids with a count.
tags = pd.read_csv('../data/raw/tags.csv')
help_table = pd.read_csv('../data/raw/book_tags.csv')

# Plain-text preview of both frames, one after the other.
print(tags.head(), help_table.head(), sep='\n')

   tag_id tag_name
0       0        -
1       1     --1-
2       2    --10-
3       3    --12-
4       4   --122-
   goodreads_book_id  tag_id   count
0                  1   30574  167697
1                  1   11305   37174
2                  1   11557   34173
3                  1    8717   12986
4                  1   33114   12716


To get the tags that will help our recommendation model, we need to perform two merges:
  - `help_table` with `tags` to get the tag names, which we then collect into a list for every book
  - the resulting lists of tags with `books` to create the complete dataset

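As a Goodreads user, I suppose that tags are just the names of users' shelves, so to avoid duplicated book rows after the merge we need to collect each book's tags into a single list. But before that, I would like to remove unnecessary tags: here, a tag assignment is kept only if its count is above the average count across the whole table.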

In [7]:
# Mean of `count` over every (book, tag) row; used as the popularity cut-off.
avg_popularity = help_table['count'].mean()

# NOTE: `help_table` is rebound below without its 'count' column, so re-running
# this cell without first re-running the loading cell raises a KeyError.
is_popular = help_table['count'] > avg_popularity
help_table = help_table.loc[is_popular].drop(columns='count')
help_table = help_table.rename(columns={'goodreads_book_id': 'book_id'})

# Attach the readable tag name to each remaining (book, tag) row.
help_table = pd.merge(help_table, tags, on='tag_id', how='left')

# Collapse to one row per book, holding that book's tag names as a list.
merged_tags = (
    help_table
    .groupby('book_id')['tag_name']
    .apply(list)
    .reset_index()
)
merged_tags.head()

Unnamed: 0,book_id,tag_name
0,1,"[to-read, fantasy, favorites, currently-readin..."
1,2,"[to-read, currently-reading, fantasy, favorite..."
2,3,"[to-read, favorites, fantasy, currently-readin..."
3,5,"[favorites, fantasy, currently-reading, young-..."
4,6,"[fantasy, young-adult, fiction, harry-potter, ..."


We can see that our lists of tags still contain a variety of tags that won't help us (like to-read, books-i-own, etc.), but for now I'll leave them in because I don't have a way to get rid of those tags other than dropping them one by one.

Now let's focus on merging the book dataset with the tags.

In [8]:
# Left join on book_id: book rows are kept even when no tag list matches,
# in which case `tag_name` is NaN for that row.
dataset = pd.merge(books, merged_tags, how='left', on='book_id')

dataset.head()

Unnamed: 0,book_id,title,original_title,authors,isbn,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,tag_name
0,2767052,"The Hunger Games (The Hunger Games, #1)",The Hunger Games,Suzanne Collins,439023483,4.34,4780653,4942365,155254,"[favorites, currently-reading, young-adult, fi..."
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré",439554934,4.44,4602479,4800065,75867,"[to-read, favorites, fantasy, currently-readin..."
2,41865,"Twilight (Twilight, #1)",Twilight,Stephenie Meyer,316015849,3.57,3866839,3916824,95009,"[young-adult, fantasy, favorites, vampires, ya..."
3,2657,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,61120081,4.25,3198671,3340896,72586,"[classics, favorites, to-read, classic, histor..."
4,4671,The Great Gatsby,The Great Gatsby,F. Scott Fitzgerald,743273567,3.89,2683664,2773745,51992,"[classics, favorites, fiction, classic, books-..."
