In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Clean BooksData

In [5]:
books_df = pd.read_csv('data/books_data.csv')
books_df.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
0,Its Only Art If Its Well Hung!,,['Julie Strain'],http://books.google.com/books/content?id=DykPA...,http://books.google.nl/books?id=DykPAAAACAAJ&d...,,1996,http://books.google.nl/books?id=DykPAAAACAAJ&d...,['Comics & Graphic Novels'],
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],http://books.google.com/books/content?id=IjvHQ...,http://books.google.nl/books?id=IjvHQsCn_pgC&p...,A&C Black,2005-01-01,http://books.google.nl/books?id=IjvHQsCn_pgC&d...,['Biography & Autobiography'],
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],http://books.google.com/books/content?id=2tsDA...,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,,2000,http://books.google.nl/books?id=2tsDAAAACAAJ&d...,['Religion'],
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],http://books.google.com/books/content?id=aRSIg...,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,iUniverse,2005-02,http://books.google.nl/books?id=aRSIgJlq6JwC&d...,['Fiction'],
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],,http://books.google.nl/books?id=399SPgAACAAJ&d...,,2003-03-01,http://books.google.nl/books?id=399SPgAACAAJ&d...,,


In [12]:
# clean books_df
cleaned_books = books_df.copy()
cleaned_books = cleaned_books.drop(columns=['publisher', 'image', 'previewLink', 'infoLink', 'ratingsCount'])


In [13]:
# Check for duplicates in title
cleaned_books['Title'].is_unique

True

In [14]:
cleaned_books

Unnamed: 0,Title,description,authors,publishedDate,categories
0,Its Only Art If Its Well Hung!,,['Julie Strain'],1996,['Comics & Graphic Novels']
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],2005-01-01,['Biography & Autobiography']
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],2000,['Religion']
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],2005-02,['Fiction']
4,"Nation Dance: Religion, Identity and Cultural ...",,['Edward Long'],2003-03-01,
...,...,...,...,...,...
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],2000-06-01,['Juvenile Fiction']
212400,Red Boots for Christmas,Everyone in the village of Friedensdorf is hap...,,1995,['Juvenile Fiction']
212401,Mamaw,"Give your Mamaw a useful, beautiful and though...",['Wild Wild Cabbage'],2018-01-17,
212402,The Autograph Man,Alex-Li Tandem sells autographs. His business ...,['Zadie Smith'],2003-08-12,['Fiction']


In [42]:
cleaned_books['categories'].value_counts()

# clean the column; structured as ['category'], change to just category
replacement_map = {
    '[': '',
    ']': '',
    "'": '',
    ",": ''
}
cleaned_books['categories'] = cleaned_books['categories'].replace('[\[\]\',]', '', regex=True)

In [43]:
category_counts = cleaned_books['categories'].value_counts()
# convert categories to lowercase
cleaned_books['categories'] = cleaned_books['categories'].str.lower()
# keep only categories with more than 2 books
categories_keep = category_counts[category_counts > 50].index

In [44]:
categories_keep

Index(['fiction', 'religion', 'history', 'juvenile fiction',
       'biography & autobiography', 'business & economics', 'computers',
       'social science', 'juvenile nonfiction', 'science',
       ...
       'crime', 'encyclopedias and dictionaries', 'adventure and adventurers',
       'christmas stories', 'india', 'dungeons and dragons (game)',
       'aeronautics', 'american wit and humor pictorial', 'americans',
       'buddhism'],
      dtype='object', name='categories', length=136)

In [45]:
cleaned_books = cleaned_books[cleaned_books["categories"].isin(categories_keep)]

In [61]:
rename_map = {
    'Title': 'title',
    'description': 'description',
    'authors': 'authors',
    'publishedDate': 'published_date',
    'categories': 'categories',
}
cleaned_books.rename(columns=rename_map, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_books.rename(columns=rename_map, inplace=True)


In [62]:
cleaned_books

Unnamed: 0,title,description,authors,published_date,categories
0,Its Only Art If Its Well Hung!,,['Julie Strain'],1996,comics & graphic novels
1,Dr. Seuss: American Icon,Philip Nel takes a fascinating look into the k...,['Philip Nel'],2005-01-01,biography & autobiography
2,Wonderful Worship in Smaller Churches,This resource includes twelve principles in un...,['David R. Ray'],2000,religion
3,Whispers of the Wicked Saints,Julia Thomas finds her life spinning out of co...,['Veronica Haddon'],2005-02,fiction
5,The Church of Christ: A Biblical Ecclesiology ...,In The Church of Christ: A Biblical Ecclesiolo...,['Everett Ferguson'],1996,religion
...,...,...,...,...,...
212397,The Magic of the Soul: Applying Spiritual Powe...,"""The Magic of the Soul, Applying Spiritual Pow...",['Patrick J. Harbula'],2002-09-01,body mind & spirit
212398,Autodesk Inventor 10 Essentials Plus,Autodesk Inventor 2017 Essentials Plus provide...,"['Daniel Banach', 'Travis Jones']",2016-03,computers
212399,The Orphan Of Ellis Island (Time Travel Advent...,"During a school trip to Ellis Island, Dominick...",['Elvira Woodruff'],2000-06-01,juvenile fiction
212400,Red Boots for Christmas,Everyone in the village of Friedensdorf is hap...,,1995,juvenile fiction


In [63]:
# write out to cleaned_data
import os
if not os.path.exists('cleaned_data'):
    os.makedirs('cleaned_data')
cleaned_books.to_csv('cleaned_data/cleaned_books.csv', index=False)

# Clean the ratings 

In [52]:
ratings_df = pd.read_csv('data/Books_rating.csv')
ratings_df.head()

In [54]:
ratings_df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [57]:
cleaned_ratings = ratings_df.copy().drop(columns=['Id', 'User_id', 'profileName', 'review/time'])

In [66]:
# rename columns
rename_map = {
    'Title': 'title',
    'Price': 'price',
    'review/helpfulness': 'review_helpfulness',
    'review/score': 'review_score',
    'review/summary': 'summary',
    'review/text': 'text'
}
cleaned_ratings = cleaned_ratings.rename(columns=rename_map)
cleaned_ratings

Unnamed: 0,title,price,helpfulness,score,summary,text
0,Its Only Art If Its Well Hung!,,7/7,4.0,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,Dr. Seuss: American Icon,,10/10,5.0,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Dr. Seuss: American Icon,,10/11,5.0,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Dr. Seuss: American Icon,,7/7,4.0,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Dr. Seuss: American Icon,,3/3,4.0,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
...,...,...,...,...,...,...
2999995,The Idea of History,,14/19,4.0,Difficult,"This is an extremely difficult book to digest,..."
2999996,The Idea of History,,1/1,4.0,Quite good and ahead of its time occasionally,This is pretty interesting. Collingwood seems ...
2999997,The Idea of History,,0/0,4.0,Easier reads of those not well versed in histo...,"This is a good book but very esoteric. ""What i..."
2999998,The Idea of History,,1/11,5.0,"Yes, it is cheaper than the University Bookstore","My daughter, a freshman at Indiana University,..."


In [68]:
# Keep rows where title is in cleaned_books
cleaned_ratings = cleaned_ratings[cleaned_ratings['title'].isin(cleaned_books['title'])]
cleaned_ratings.reset_index(inplace=True, drop=True)

In [69]:
cleaned_ratings

Unnamed: 0,title,price,helpfulness,score,summary,text
0,Its Only Art If Its Well Hung!,,7/7,4.0,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,Dr. Seuss: American Icon,,10/10,5.0,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Dr. Seuss: American Icon,,10/11,5.0,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Dr. Seuss: American Icon,,7/7,4.0,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Dr. Seuss: American Icon,,3/3,4.0,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...
...,...,...,...,...,...,...
2193369,The Idea of History,,14/19,4.0,Difficult,"This is an extremely difficult book to digest,..."
2193370,The Idea of History,,1/1,4.0,Quite good and ahead of its time occasionally,This is pretty interesting. Collingwood seems ...
2193371,The Idea of History,,0/0,4.0,Easier reads of those not well versed in histo...,"This is a good book but very esoteric. ""What i..."
2193372,The Idea of History,,1/11,5.0,"Yes, it is cheaper than the University Bookstore","My daughter, a freshman at Indiana University,..."


In [70]:
# export
cleaned_ratings.to_csv('cleaned_data/cleaned_ratings.csv', index=False)