In [1]:
from datetime import datetime

import pandas as pd

### Welcome to the Apply Challenge

In [2]:
link = 'https://raw.githubusercontent.com/chriszapp/datasets/main/books.csv'
# Load only the first 13 rows: a slightly bigger sample than the 5 rows head() displays
books = pd.read_csv(filepath_or_buffer=link, nrows=13)
books.head(n=5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [3]:
# First look at the dataset: its (rows, columns) shape, then the dtype of every column.
# A single print with a newline separator emits exactly what two consecutive prints would.
print(f'The shape {books.shape} \n\n\n', books.dtypes, sep='\n')

The shape (13, 12) 



bookID                  int64
title                  object
authors                object
average_rating        float64
isbn                   object
isbn13                  int64
language_code          object
  num_pages             int64
ratings_count           int64
text_reviews_count      int64
publication_date       object
publisher              object
dtype: object


In [4]:
# Mission 1: use apply to create a new column called 'reviews_opinion': if the rating is between 4.5 and 5, the reviews_opinion is 'very good';
# if the rating is between 4 and 4.5, the reviews_opinion is 'good'; if the rating is between 3 and 4, the reviews_opinion is 'neutral';
# if the rating is up to 3, the reviews_opinion is 'bad'

# Add your code here
def add_opinion(rating: float) -> str:
  """Translate an average rating into an opinion label.

  The lower bounds are tried from highest to lowest and the first one the
  rating reaches decides the label:
  >= 4.5 -> 'very good', >= 4 -> 'good', >= 3 -> 'neutral'.
  Any rating that reaches none of them is labelled 'bad'.
  """
  lower_bounds = ((4.5, 'very good'), (4, 'good'), (3, 'neutral'))
  for lower_bound, label in lower_bounds:
    if rating >= lower_bound:
      return label
  return 'bad'

# Label every book by running its average rating through add_opinion
books['reviews_opinion'] = books['average_rating'].apply(add_opinion)

# Show only the columns needed to check the labels against the ratings
books[['title', 'average_rating', 'reviews_opinion']]

Unnamed: 0,title,average_rating,reviews_opinion
0,Harry Potter and the Half-Blood Prince (Harry ...,4.57,very good
1,Harry Potter and the Order of the Phoenix (Har...,4.49,good
2,Harry Potter and the Chamber of Secrets (Harry...,4.42,good
3,Harry Potter and the Prisoner of Azkaban (Harr...,4.56,very good
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,4.78,very good
5,"Unauthorized Harry Potter Book Seven News: ""Ha...",3.74,neutral
6,Harry Potter Collection (Harry Potter #1-6),4.73,very good
7,The Ultimate Hitchhiker's Guide: Five Complete...,4.38,good
8,The Ultimate Hitchhiker's Guide to the Galaxy ...,4.38,good
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...,4.22,good


In [5]:
# Mission 2: using apply with a function, create a column with the number of authors per book
# (note that in the column 'authors', each name of the author is divided with the character '/')

def add_author_number(authors_str: str) -> int:
  """Count the names in a '/'-separated authors string.

  Each '/' sits between two names, so the number of names is one more than
  the number of separators (a string without any '/' counts as 1).
  """
  return authors_str.count('/') + 1

# One author count per book, computed from its 'authors' string
books['authors_number'] = books['authors'].apply(add_author_number)
books[['title', 'authors', 'authors_number']]

Unnamed: 0,title,authors,authors_number
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,2
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,2
2,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,1
3,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,2
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,2
5,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,1
6,Harry Potter Collection (Harry Potter #1-6),J.K. Rowling,1
7,The Ultimate Hitchhiker's Guide: Five Complete...,Douglas Adams,1
8,The Ultimate Hitchhiker's Guide to the Galaxy ...,Douglas Adams,1
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,1


In [6]:
# Mission 3: rewrite the same task as Mission 2 using a lambda function

# Add your code here
# Remove the column created in Mission 2 so this cell demonstrably rebuilds it.
# DataFrame.drop returns a new frame instead of modifying `books`, so the result
# must be assigned back; errors='ignore' keeps the cell re-runnable when the
# column is already gone.
books = books.drop(columns=['authors_number'], errors='ignore')

# Same count as Mission 2, written inline: number of '/'-separated names
books['authors_number'] = books['authors'].apply(lambda authors_str: len(authors_str.split('/')))
books.loc[:, ['title', 'authors', 'authors_number']]

Unnamed: 0,title,authors,authors_number
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,2
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,2
2,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,1
3,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,2
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,2
5,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,1
6,Harry Potter Collection (Harry Potter #1-6),J.K. Rowling,1
7,The Ultimate Hitchhiker's Guide: Five Complete...,Douglas Adams,1
8,The Ultimate Hitchhiker's Guide to the Galaxy ...,Douglas Adams,1
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...,Douglas Adams,1


In [7]:
# Mission 4: create a column 'comment_ratio'.
# Use the columns 'ratings_count' and 'text_reviews_count' to calculate, for each book, the percentage of ratings that come with a text review:
# percentage = text_reviews_count / ratings_count * 100
# Use the mathematical operations between columns

# Add your code here
# Text reviews as a percentage of all ratings, rounded to 2 decimals (column arithmetic only)
books['comment_ratio'] = (books['text_reviews_count'] / books['ratings_count'] * 100).round(2)
books[['title', 'text_reviews_count', 'ratings_count', 'comment_ratio']]

Unnamed: 0,title,text_reviews_count,ratings_count,comment_ratio
0,Harry Potter and the Half-Blood Prince (Harry ...,27591,2095690,1.32
1,Harry Potter and the Order of the Phoenix (Har...,29221,2153167,1.36
2,Harry Potter and the Chamber of Secrets (Harry...,244,6333,3.85
3,Harry Potter and the Prisoner of Azkaban (Harr...,36325,2339585,1.55
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,164,41428,0.4
5,"Unauthorized Harry Potter Book Seven News: ""Ha...",1,19,5.26
6,Harry Potter Collection (Harry Potter #1-6),808,28242,2.86
7,The Ultimate Hitchhiker's Guide: Five Complete...,254,3628,7.0
8,The Ultimate Hitchhiker's Guide to the Galaxy ...,4080,249558,1.63
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...,460,4930,9.33


In [10]:
# Mission 5: obtain the same result of Mission 4 using apply with a function.
# Try both methods to apply to the whole dataset and apply to selected columns

# Add your code here
def get_comment_ratio(row) -> float:
  """Return the percentage of ratings that come with a text review.

  Args:
    row: any mapping-like object (e.g. one DataFrame row) that can be indexed
      with 'text_reviews_count' and 'ratings_count'.

  Returns:
    text_reviews_count / ratings_count * 100, rounded to 2 decimals.
  """
  text_reviews_count = row['text_reviews_count']
  ratings_count = row['ratings_count']
  # Multiply by 100 so the value is a percentage, matching
  # percentage = text / reviews * 100 from the Mission 4 statement.
  return round(text_reviews_count / ratings_count * 100, 2)


# Remove the existing 'comment_ratio' column so the values displayed below can only
# come from this cell. DataFrame.drop returns a new frame, so the result must be
# assigned back; errors='ignore' keeps the cell re-runnable.
books = books.drop(columns=['comment_ratio'], errors='ignore')

# Apply the function row by row (axis=1) and store the result under the same
# column name that is selected for display on the next line.
books['comment_ratio'] = books.apply(get_comment_ratio, axis=1)
books.loc[:, ['title', 'text_reviews_count', 'ratings_count', 'comment_ratio']]

Unnamed: 0,title,text_reviews_count,ratings_count,comment_ratio
0,Harry Potter and the Half-Blood Prince (Harry ...,27591,2095690,1.32
1,Harry Potter and the Order of the Phoenix (Har...,29221,2153167,1.36
2,Harry Potter and the Chamber of Secrets (Harry...,244,6333,3.85
3,Harry Potter and the Prisoner of Azkaban (Harr...,36325,2339585,1.55
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,164,41428,0.4
5,"Unauthorized Harry Potter Book Seven News: ""Ha...",1,19,5.26
6,Harry Potter Collection (Harry Potter #1-6),808,28242,2.86
7,The Ultimate Hitchhiker's Guide: Five Complete...,254,3628,7.0
8,The Ultimate Hitchhiker's Guide to the Galaxy ...,4080,249558,1.63
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...,460,4930,9.33


In [11]:
# Mission 6: obtain the same result of Mission 5 using apply lambda
# Try both methods to apply to the whole dataset and apply to selected columns

# Remove the existing 'comment_ratio' column so this cell demonstrably rebuilds it.
# DataFrame.drop returns a new frame, so the result must be assigned back;
# errors='ignore' keeps the cell re-runnable when the column is already gone.
books = books.drop(columns=['comment_ratio'], errors='ignore')

# Row-wise lambda: text reviews as a percentage of all ratings, rounded to 2 decimals
books['comment_ratio'] = books.apply(
    lambda row: round(row['text_reviews_count'] / row['ratings_count'] * 100, 2),
    axis=1,
)
books.loc[:, ['title', 'text_reviews_count', 'ratings_count', 'comment_ratio']]



Unnamed: 0,title,text_reviews_count,ratings_count,comment_ratio
0,Harry Potter and the Half-Blood Prince (Harry ...,27591,2095690,1.32
1,Harry Potter and the Order of the Phoenix (Har...,29221,2153167,1.36
2,Harry Potter and the Chamber of Secrets (Harry...,244,6333,3.85
3,Harry Potter and the Prisoner of Azkaban (Harr...,36325,2339585,1.55
4,Harry Potter Boxed Set Books 1-5 (Harry Potte...,164,41428,0.4
5,"Unauthorized Harry Potter Book Seven News: ""Ha...",1,19,5.26
6,Harry Potter Collection (Harry Potter #1-6),808,28242,2.86
7,The Ultimate Hitchhiker's Guide: Five Complete...,254,3628,7.0
8,The Ultimate Hitchhiker's Guide to the Galaxy ...,4080,249558,1.63
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...,460,4930,9.33


In [None]:
# Mission 7: extract the year from the publication_date column. How many distinct publication years are recorded in this dataset? Which are they?
# Parse each 'month/day/year' string and keep only the year.
# `datetime` is imported in the notebook's first cell together with pandas.
books['publication_year'] = books['publication_date'].apply(
    lambda date_text: datetime.strptime(date_text, '%m/%d/%Y').year
)

# Count the books per year directly on the new column: Series.value_counts is the
# documented way to count one column's values, whereas DataFrame.value_counts
# expects column labels (not a Series) as its `subset`.
year_counts = books['publication_year'].value_counts()

# One entry per distinct year, so the size of the counts is the number of years
print(f'Number of unique publication years: {year_counts.size}\n')
year_counts

Number of unique publication years: 6



publication_year
2004    5
2005    4
1996    1
2002    1
2003    1
2006    1
dtype: int64