# Data Cleaning

In [2]:
import numpy as np, pandas as pd
import os
from sklearn.utils import shuffle
import joblib



## Load data

In [3]:
# Read the three tables previewed below (ratings, to-read list, book
# metadata) from the goodbooks-10k folder, in the order listed.
ratings, to_read, books = (
    pd.read_csv(csv_path)
    for csv_path in (
        'goodbooks-10k/ratings.csv',
        'goodbooks-10k/to_read.csv',
        'goodbooks-10k/books.csv',
    )
)

# Read the two tag-related tables from the same folder.
tags, book_tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'goodbooks-10k/tags.csv',
        'goodbooks-10k/book_tags.csv',
    )
)

In [4]:
# Preview the first rows of the ratings table loaded from ratings.csv.
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
# Preview the first rows of the to_read table loaded from to_read.csv.
to_read.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


In [6]:
# Preview the first rows of the books table loaded from books.csv.
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


## Check if there are books with multiple authors

In [7]:
# Show one example row whose `authors` value contains a comma, i.e. lists
# more than one author. The vectorized `.str.contains` replaces the per-row
# lambda; `regex=False` matches the comma literally and `na=False` treats a
# missing `authors` value as "no comma" instead of raising a TypeError.
books[books["authors"].str.contains(",", regex=False, na=False)].head(1)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...


In [8]:
# List every column label of the books table (the preview above elides some).
books.columns

Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')

In [9]:
# Put the two title columns side by side to see how they differ.
books[["original_title", "title"]].head()

Unnamed: 0,original_title,title
0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)"
1,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...
2,Twilight,"Twilight (Twilight, #1)"
3,To Kill a Mockingbird,To Kill a Mockingbird
4,The Great Gatsby,The Great Gatsby


## Deal with NaNs in original_title

In [10]:
# Rows where original_title is missing, shown next to the title column.
books.loc[books["original_title"].isna(), ["original_title", "title"]].head()

Unnamed: 0,original_title,title
74,,"Bridget Jones's Diary (Bridget Jones, #1)"
142,,All the Light We Cannot See
209,,"Vampire Academy (Vampire Academy, #1)"
214,,Ready Player One
256,,Alice in Wonderland


In [11]:
def deal_nan(x, y):
    """Return ``x``, or a fallback derived from ``y`` when ``x`` is missing.

    Parameters
    ----------
    x : str or missing value
        The preferred value (here: ``original_title``).
    y : str or missing value
        The fallback source (here: ``title``). Anything from the first
        "(" onward is dropped, e.g. "Twilight (Twilight, #1)" -> "Twilight".

    Returns
    -------
    ``x`` when it is present. Otherwise ``y`` with its parenthesised suffix
    removed and surrounding whitespace stripped; ``y`` unchanged when it
    has no "(" or when stripping would leave an empty string. If ``y`` is
    missing as well, ``x`` is returned as-is (still missing).
    """
    # pd.isna recognises None, np.nan, float("nan") and pd.NA alike; an
    # identity test against np.nan only matches that one singleton object.
    if not pd.isna(x):
        return x
    # Nothing to fall back on: leave the value missing rather than crash
    # on the substring test below.
    if pd.isna(y):
        return x
    if "(" not in y:
        return y
    prefix = y[:y.index("(")].strip()
    # A title that starts with "(" would otherwise collapse to "".
    return prefix or y
# Row by row, replace a missing original_title with the value deal_nan
# derives from that row's title; rows that already have one are unchanged.
books["original_title"] = books[["original_title", "title"]].apply(
    lambda row: deal_nan(row["original_title"], row["title"]),
    axis=1,
)

In [12]:
# Confirm that no missing titles remain (expected: False).
# `np.nan in series` tests the Series *index labels*, not its values, so it
# reports False no matter what the column contains; isna().any() inspects
# the values. bool() keeps the displayed result a plain True/False.
bool(books["original_title"].isna().any())

False

## Save to a new file for future use

In [19]:
# Write the four selected columns to books_cleaned.csv, exposing the
# filled-in original_title under the column name "title" and omitting
# the DataFrame index.
(
    books[["book_id", "authors", "original_title", "isbn13"]]
    .rename(columns={"original_title": "title"})
    .to_csv("goodbooks-10k/books_cleaned.csv", index=False)
)

# DONE WITH DATA CLEANING