In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the book details from the CSV file
df = pd.read_csv('book_details.csv')

In [3]:
df.columns

Index(['type', 'title', 'authors', 'publish_date', 'source_records',
       'number_of_pages', 'publishers', 'isbn_10', 'isbn_13',
       'physical_format', 'full_title', 'covers', 'works', 'key',
       'lc_classifications', 'local_id', 'latest_revision', 'revision',
       'created', 'last_modified', 'identifiers', 'classifications',
       'description', 'publish_places', 'contributions', 'subjects',
       'languages', 'pagination', 'dewey_decimal_class', 'edition_name',
       'lccn', 'publish_country', 'by_statement', 'oclc_numbers', 'subtitle',
       'contributors', 'oclc_number', 'notes', 'first_sentence', 'ocaid',
       'weight', 'physical_dimensions', 'series', 'subject_people',
       'copyright_date', 'table_of_contents', 'coverimage', 'genres',
       'ia_box_id', 'ia_loaded_id', 'subject_place', 'url', 'uri_descriptions',
       'uris', 'links', 'other_titles', 'work_title', 'openlibrary',
       'original_isbn', 'location', 'work_titles', 'subject_time'],
      dtype='

In [4]:
# Initialize a dictionary to store the answers
answers = {}

In [5]:
# Question 1: How many different books are in the list?
answers['1'] = len(df['title'].unique())

In [6]:
# Question 2: What is the book with the most number of different ISBNs?
most_isbns_book = df.groupby('title')['original_isbn'].nunique().idxmax()
answers['2'] = most_isbns_book

In [20]:
# Question 3: How many books don’t have a goodreads id?
answers['3'] = df['identifiers'].apply(lambda x: 'goodreads' not in x if isinstance(x, dict) else True).sum()

In [8]:
# Question 4: How many books have more than one author?
answers['4'] = (df['authors'].str.count(',') + 1 > 1).sum()

In [9]:
# Question 5: What is the number of books published per publisher?
books_per_publisher = df['publishers'].value_counts().to_dict()
answers['5'] = books_per_publisher

In [10]:
# Question 6: What is the median number of pages for books in this list?
answers['6'] = df['number_of_pages'].median()

In [33]:
# Question 7: What is the month with the most number of published books?

# Convert 'publish_date' column to datetime format
df['publish_date'] = pd.to_datetime(df['publish_date'], errors='coerce')

# Extract the month from the 'publish_date' column
df['publish_month'] = df['publish_date'].dt.month_name()

# Count the number of books published in each month
month_counts = df['publish_month'].value_counts()

# Find the month with the most number of published books
answers['7'] = month_counts.idxmax()

In [55]:
# Question 8: What is/are the longest word/s that appear/s in a book’s description or in the first sentence of a book?

# Function to extract the first sentence from the description
def extract_first_sentence(description):
    if isinstance(description, str):
        return description.split('.')[0]  # Assuming the first sentence ends with a period
    return ''

# Apply the function to create a new column 'first_sentence'
df['first_sentence'] = df['description'].apply(extract_first_sentence)

# Function to find the longest word in a sentence
def find_longest_word(sentence):
    words = sentence.split()
    longest_word = max(words, key=len) 
    return longest_word

# Find the longest word in the 'description' and 'first_sentence' columns
longest_words_description = df['description'].apply(find_longest_word)
longest_words_first_sentence = df['first_sentence'].apply(find_longest_word)

# Combine the longest words from both columns
all_longest_words = longest_words_description.append(longest_words_first_sentence).drop_duplicates()

answers['8'] = all_longest_words.tolist()

AttributeError: 'float' object has no attribute 'split'

In [30]:
# Question 9: What was the last book published in the list?

# Convert the 'publish_date' column to datetime format
df['publish_date'] = pd.to_datetime(df['publish_date'], errors='coerce')

# Sort the dataframe by 'publish_date' in descending order
df_sorted = df.sort_values('publish_date', ascending=False)

# Get the title of the last book published
answers['9'] = df_sorted.iloc[0]['title']

In [56]:
# Question 10: What is the year of the most updated entry in the list?

# Convert the 'last_modified' column to datetime format
df['last_modified'] = pd.to_datetime(df['last_modified'].apply(lambda x: x['value']))

# Extract the year from the 'last_modified' column
df['last_modified_year'] = df['last_modified'].dt.year

# Find the year of the most updated entry
answers['10'] = df['last_modified_year'].max()

TypeError: string indices must be integers

In [15]:
# Question 11: What is the title of the second published book for the author with the highest number of different titles in the list?
author_with_highest_titles = df['authors'].value_counts().idxmax()
second_published_book = df[df['authors'] == author_with_highest_titles]['title'].iloc[1]
answers['11'] = second_published_book

In [16]:
# Question 12: What is the pair of (publisher, author) with the highest number of books published?
publisher_author_pair = df.groupby(['publishers', 'authors']).size().idxmax()
answers['12'] = publisher_author_pair

In [17]:
# Log the answers to a file
with open('answers.txt', 'w') as file:
    for question, answer in answers.items():
        file.write(f"Question {question}: {answer}\n")