# Challenge
Clean data with Pandas

## Details:
Use the provided JSON dataset and complete the following:
1. Some string values may be, "" instead of null. Set these fields to NaN with numpy before you continue cleaning
2. Clamp publication year to the years of 1800-2026
3. Ensure average_rating, page_count, etc can't have impossible values
4. Normalize dates in "last_checkout"
5. Ensure genres, languages, formats, and publishers have proper capitalization
6. Find and remove duplicate books with the same title/author
7. Drop any rows with empty fields

In [22]:
import pandas as pd
import numpy as np

df = pd.read_json("../books_dirty.json")

# 1. replace empty strings with nan
df.replace("", np.nan, inplace=True)

# 3. ensure numeric values can't have impossible values
df["publication_year"] = pd.to_numeric(df["publication_year"], errors="coerce")
df["publication_year"] = df["publication_year"].fillna(0)

df["average_rating"] = pd.to_numeric(df["average_rating"], errors="coerce")
df["average_rating"] = df["average_rating"].fillna(0)
df["average_rating"] = df["average_rating"].clip(1, 5)

df["page_count"] = pd.to_numeric(df["page_count"], errors="coerce")
df["page_count"] = df["page_count"].fillna(0)
df["page_count"] = df["page_count"].clip(120, 1100)

df["ratings_count"] = pd.to_numeric(df["ratings_count"], errors="coerce")
df["ratings_count"] = df["ratings_count"].fillna(0)
df["ratings_count"] = df["ratings_count"].clip(25, 1000)

df["price_usd"] = pd.to_numeric(df["price_usd"], errors="coerce")
df["price_usd"] = df["price_usd"].fillna(0)
df["price_usd"] = df["price_usd"].clip(7.99, 14.99)

df["sales_millions"] = pd.to_numeric(df["sales_millions"], errors="coerce")
df["sales_millions"] = df["sales_millions"].fillna(0)
df["sales_millions"] = df["sales_millions"].clip(0.01, 15)

# 4. normalize last_checkout dates
df["last_checkout"] = df["last_checkout"].replace("N/A", np.nan)
df["last_checkout"] = pd.to_datetime(df["last_checkout"], format="mixed")

# 5. ensure genres, languages, formats, and publishers have proper capitalization (.title())
df["genre"] = df["genre"].str.title()
df["language"] = df["language"].str.title()
df["language"] = df["language"].replace("Eng", "English")
df["format"] = df["format"].str.title()
df["publisher"] = df["publisher"].str.title()

# 2. clamp publication year to 1800 - 2026
df["publication_year"] = df["publication_year"].fillna(0)
df["publication_year"] = df["publication_year"].clip(1800, 2026)

# 6. find and remove duplicate books with same title/author
df.drop_duplicates(subset=["title", "author"], inplace=True)

# 7. drop rows with empty fields
df.dropna(inplace=True)

df


Unnamed: 0,book_id,title,author,genre,publication_year,page_count,average_rating,ratings_count,price_usd,publisher,language,format,in_print,sales_millions,last_checkout,available
27,17cc0642-6e0d-4f86-9f92-21d511b61139,Book Title 4,Author 12,History,1800.0,120.0,1.0,837.0,7.99,Galactic Books,German,Ebook,False,0.01,2024-05-23 23:36:03.170880,False
28,1a4643c0-d1fe-4f0c-b4f9-57c8c0bbee3f,Book Title 13,Author 20,Technology,1948.0,120.0,1.0,1000.0,7.99,Old Tree Publishing,English,Audio Book,False,0.01,2021-06-05 23:36:03.170890,False
30,76a01131-ff65-457a-b062-51f9839c5097,Book Title 4,Author 3,Biography,1800.0,120.0,1.2,25.0,14.99,Old Tree Publishing,Spanish,Paperback,True,0.01,2024-03-17 23:36:03.170910,False
31,710b0c5c-98b9-4a0d-a933-daa5136948b0,Book Title 7,Author 18,Technology,1800.0,621.0,1.0,1000.0,7.99,Sunshine Media,English,Audiobook,True,8.09,2024-11-27 23:36:03.170920,False
39,67b45a87-9689-457b-9503-91effe9aa1ab,Book Title 3,Author 25,History,1800.0,120.0,1.0,25.0,7.99,Old Tree Publishing,Spanish,Hardcover,False,0.01,2021-10-20 23:36:03.170999,True
45,cdb25f3b-1fb6-46b9-9398-fb5bc610f590,Book Title 15,Author 27,Science Fiction,1800.0,120.0,1.0,1000.0,14.99,Sunshine Media,German,Audio Book,False,0.01,2024-05-06 23:36:03.171058,True
50,d016f92b-e86c-4d42-9baa-8110c9dd1fd1,Book Title 10,Author 1,Mystery,1968.0,120.0,1.0,25.0,7.99,Galactic Books,English,Hardcover,True,0.01,2021-09-27 23:36:03.171109,False
108,08eb3066-4a85-4799-95ca-84f2e469c439,Book Title 13,Author 25,Technology,1842.0,996.0,3.19,25.0,14.99,Old Tree Publishing,German,Audio Book,True,0.01,2025-01-09 23:36:03.171689,False
118,c014f91d-6e30-4ad2-bd8a-50991967a007,Book Title 10,Author 29,Technology,1800.0,120.0,1.0,1000.0,7.99,Old Tree Publishing,German,Hardcover,True,0.01,2024-07-04 23:36:03.171789,False
133,a8fb4e12-4ad2-4502-b506-bf5265866964,Book Title 15,Author 17,Romance,1800.0,120.0,1.0,1000.0,7.99,Old Tree Publishing,German,Paperback,True,5.97,2025-02-15 23:36:03.172003,True
