In [31]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


### Load dataset

In [32]:

# Fetch the books dataset through kagglehub; the call hands back the local
# directory that holds the downloaded files.
dataset_handle = "elvinrustam/books-dataset"
dataset_path = kagglehub.dataset_download(dataset_handle)

# Report where the files were stored
print("Dataset downloaded to:", dataset_path)


Dataset downloaded to: C:\Users\afraa\.cache\kagglehub\datasets\elvinrustam\books-dataset\versions\3


In [33]:

# Find the CSV file in the downloaded directory.
# os.listdir() returns entries in arbitrary order, so the matches are sorted to
# make the choice deterministic, and an empty result fails with a clear message
# instead of a bare IndexError.
csv_files = sorted(f for f in os.listdir(dataset_path) if f.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {dataset_path!r}")
csv_file = csv_files[0]

# Load the dataset
df = pd.read_csv(os.path.join(dataset_path, csv_file))

# Display the first few rows
df.head()


Unnamed: 0,Title,Authors,Description,Category,Publisher,Publish Date,Price
0,Goat Brothers,"By Colton, Larry",,"History , General",Doubleday,"Friday, January 1, 1993",Price Starting at $8.79
1,The Missing Person,"By Grumbach, Doris",,"Fiction , General",Putnam Pub Group,"Sunday, March 1, 1981",Price Starting at $4.99
2,Don't Eat Your Heart Out Cookbook,"By Piscatella, Joseph C.",,"Cooking , Reference",Workman Pub Co,"Thursday, September 1, 1983",Price Starting at $4.99
3,When Your Corporate Umbrella Begins to Leak: A...,"By Davis, Paul D.",,,Natl Pr Books,"Monday, April 1, 1991",Price Starting at $4.99
4,Amy Spangler's Breastfeeding : A Parent's Guide,"By Spangler, Amy",,,Amy Spangler,"Saturday, February 1, 1997",Price Starting at $5.32


### Inspect data

In [34]:
# (number of rows, number of columns) of the loaded frame
df.shape

(103082, 7)

In [35]:
# Per-column summary; the columns hold text, so the summary reports
# count / unique / top / freq rather than numeric statistics.
df.describe()

Unnamed: 0,Title,Authors,Description,Category,Publisher,Publish Date,Price
count,103082,103082,70213,76912,103074,103082,103082
unique,97818,63580,68831,3106,13029,956,1387
top,The Nutcracker,By,For Ingest Only - Data needs to be cleaned up ...,"Fiction , General",Simon & Schuster,"Thursday, January 1, 2004",Price Starting at $5.29
freq,12,1043,30,2549,1521,868,41876


### Check for missing data

In [36]:
# Count the missing values in each column
df.isna().sum()

Title               0
Authors             0
Description     32869
Category        26170
Publisher           8
Publish Date        0
Price               0
dtype: int64

### Check for and remove duplicates

In [None]:
# Count the rows that exactly repeat an earlier row
n_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {n_duplicates}")

Number of duplicate rows: 8


In [38]:
# Show every row that belongs to a duplicate group (first occurrences and repeats alike)
df.loc[df.duplicated(keep=False)]

Unnamed: 0,Title,Authors,Description,Category,Publisher,Publish Date,Price
21438,Love You Forever,"By Munsch, Robert N. and McGraw, Sheila",A young woman holds her newborn sonAnd looks a...,"Juvenile Fiction , Social Themes , Emotions &...",Firefly Books,"Friday, September 1, 1995",Price Starting at $4.99
26545,Organized to Be the Best!: New Timesaving Ways...,"By Silver, Susan",Provides a practical approach to time manageme...,"Business & Economics , General",Adams Hall Pub,"Sunday, October 1, 1995",Price Starting at $5.29
27034,The Germans,"By Craig, Gordon A.",,,Plume,"Friday, April 1, 1983",Price Starting at $5.29
27624,Organized to Be the Best!: New Timesaving Ways...,"By Silver, Susan",Provides a practical approach to time manageme...,"Business & Economics , General",Adams Hall Pub,"Sunday, October 1, 1995",Price Starting at $5.29
30120,Love You Forever,"By Munsch, Robert N. and McGraw, Sheila",A young woman holds her newborn sonAnd looks a...,"Juvenile Fiction , Social Themes , Emotions &...",Firefly Books,"Friday, September 1, 1995",Price Starting at $4.99
40770,The Prophet,By Kahlil Gibran,,,Senate,"Wednesday, January 1, 2003",Price Starting at $5.29
50350,The Prophet,By Kahlil Gibran,,,Senate,"Wednesday, January 1, 2003",Price Starting at $5.29
67074,"The Secret (Seasons of Grace, Book 1)","By Lewis, Beverly",In the seemingly ordinary Amish home of Grace ...,"Fiction , Christian , General",Bethany House Publishers,"Sunday, March 1, 2009",Price Starting at $5.29
67420,The Germans,"By Craig, Gordon A.",,,Plume,"Friday, April 1, 1983",Price Starting at $5.29
69075,"The Secret (Seasons of Grace, Book 1)","By Lewis, Beverly",In the seemingly ordinary Amish home of Grace ...,"Fiction , Christian , General",Bethany House Publishers,"Sunday, March 1, 2009",Price Starting at $5.29


In [39]:
# Show only the repeats: rows identical to an earlier row, which are the ones to be removed
duplicate_mask = df.duplicated(keep="first")
df[duplicate_mask]


Unnamed: 0,Title,Authors,Description,Category,Publisher,Publish Date,Price
27624,Organized to Be the Best!: New Timesaving Ways...,"By Silver, Susan",Provides a practical approach to time manageme...,"Business & Economics , General",Adams Hall Pub,"Sunday, October 1, 1995",Price Starting at $5.29
30120,Love You Forever,"By Munsch, Robert N. and McGraw, Sheila",A young woman holds her newborn sonAnd looks a...,"Juvenile Fiction , Social Themes , Emotions &...",Firefly Books,"Friday, September 1, 1995",Price Starting at $4.99
50350,The Prophet,By Kahlil Gibran,,,Senate,"Wednesday, January 1, 2003",Price Starting at $5.29
67420,The Germans,"By Craig, Gordon A.",,,Plume,"Friday, April 1, 1983",Price Starting at $5.29
69075,"The Secret (Seasons of Grace, Book 1)","By Lewis, Beverly",In the seemingly ordinary Amish home of Grace ...,"Fiction , Christian , General",Bethany House Publishers,"Sunday, March 1, 2009",Price Starting at $5.29
85329,ACSM's Guidelines for Exercise Testing and Pre...,By American College of Sports Medicine (COR),The single most internationally read and refer...,"Medical , Sports Medicine",Lippincott Williams & Wilkins,"Sunday, February 1, 2009",Price Starting at $5.29
88713,The Last Song,"By Sparks, Nicholas",From the author of A Walk to Remember comes a ...,"Fiction , Family Life , General",Grand Central Publishing,"Monday, February 1, 2010",Price Starting at $5.29
97094,Sidney Sheldon's After the Darkness,"By Bagshawe, Tilly",Author Tillie Bagshawe brilliantly recaptured ...,"Fiction , Thrillers , Suspense",Harper,"Wednesday, December 1, 2010",Price Starting at $5.29


In [40]:
# Drop duplicates, keeping the first occurrence of each repeated row.
# The result is rebound to `df` rather than using inplace=True, so the object
# that earlier cells displayed is not mutated behind the reader's back.
df = df.drop_duplicates()


In [41]:
# Check data types (every column is `object`, so nothing was parsed as a number or a date)
df.dtypes

Title           object
Authors         object
Description     object
Category        object
Publisher       object
Publish Date    object
Price           object
dtype: object

### Drop irrelevant columns

In [42]:
# Drop the columns this notebook treats as irrelevant.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone from `df`.
df = df.drop(columns=['Publish Date', 'Price'], errors="ignore")
df.head(5)

Unnamed: 0,Title,Authors,Description,Category,Publisher
0,Goat Brothers,"By Colton, Larry",,"History , General",Doubleday
1,The Missing Person,"By Grumbach, Doris",,"Fiction , General",Putnam Pub Group
2,Don't Eat Your Heart Out Cookbook,"By Piscatella, Joseph C.",,"Cooking , Reference",Workman Pub Co
3,When Your Corporate Umbrella Begins to Leak: A...,"By Davis, Paul D.",,,Natl Pr Books
4,Amy Spangler's Breastfeeding : A Parent's Guide,"By Spangler, Amy",,,Amy Spangler


In [43]:
# Keep only the leading 10000 rows
df_subset = df.iloc[:10000]

In [44]:
# Change to your file location, or set the BOOK_DATA_CSV environment variable
# to override it without editing the notebook.
output_csv = os.environ.get("BOOK_DATA_CSV", "E:/Projects/Book_Recommender/book_data.csv")

# to_csv() does not create missing folders, so make sure the target directory exists first.
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_subset.to_csv(output_csv, index=False)