In [79]:
# import
import pandas as pd
import numpy as np

from colorama import Fore, Back, Style

In [80]:
# ETL process
# A) Check the values of the ratings, books
# B) Remove NaN values of the ratings, books
# C) Merge datasets of the ratings, books
# D) Remove duplicate rows
# E) Convert data types
# F) Save the cleaned data


In [81]:
# load ratings
ratings = pd.read_csv('Original_Ratings.csv', encoding='CP1251', sep=',', low_memory=False)

In [82]:
# A) Check the values of the ratings
ratings.head()
ratings.info()
ratings['Book-Rating'].unique()

# ratings['ISBN'].unique()

# It is used in the ISBN-10 system because the last position requires a digit that satisfies the checksum algorithm (modulo 11)
# ratings['User-ID'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

In [83]:
# Check the 0 values of the ratings

ratings_zero_inkl = ratings
ratings = ratings[ratings['Book-Rating']!=0] # remove 0
ratings
ratings_zero_inkl

ratio = round(len(ratings)/len(ratings_zero_inkl),2) * 100
type(ratio)

print(f'ratio of ratings without 0 the entire dataset: {ratio}')
print(Fore.BLUE + Back.GREEN + f'IMPROVE 1) acquisition of the complete ratings for the entire dataset, save {100 - ratio}%' + Fore.RESET)

ratio of ratings without 0 the entire dataset: 38.0
[34m[42mIMPROVE 1) acquisition of the complete ratings for the entire dataset, save 62.0%[39m


In [84]:
ratings.info() # complete dataset

# B) Remove NaN values of the ratings
# pass

<class 'pandas.core.frame.DataFrame'>
Index: 433671 entries, 1 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      433671 non-null  int64 
 1   ISBN         433671 non-null  object
 2   Book-Rating  433671 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 13.2+ MB


In [85]:
# load books
books_raw = pd.read_csv('Original_Books.csv',  encoding='CP1251', sep=',') #on_bad_lines='skip' - nechceme přijít o data
# DtypeWarning: Columns (3) have mixed types.

# A) Check the values of the ratings
books_raw.head()
books_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


  books_raw = pd.read_csv('Original_Books.csv',  encoding='CP1251', sep=',') #on_bad_lines='skip' - nechceme přijít o data


In [86]:
# Year-Of-Publication mixed types
books_raw['Year-Of-Publication'].unique()

# Filter the rows with the values 'DK Publishing Inc', 'Gallimard'
books_raw[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])]

# books['Book-Author'] = books.loc[books['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Year-Of-Publication']

indexes = list(books_raw[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard'])].index)
indexes


[209538, 220731, 221678]

In [87]:
# column values move one step to the right
books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Image-URL-L'] = books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Image-URL-M']
books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Image-URL-M'] = books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Image-URL-S']
books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Image-URL-S'] = books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Publisher']
books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Publisher'] = books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Year-Of-Publication']
books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Year-Of-Publication'] = books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Book-Author']
books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Book-Author'] = books_raw.loc[books_raw['Year-Of-Publication'].isin(['DK Publishing Inc', 'Gallimard']), 'Book-Title']

In [88]:
# 'Book-Title' and 'Book-Author' are in the same column - > split

books_raw.loc[indexes]

# original copy
books = books_raw.copy()

for index in indexes:

    books_for_cycle = books_raw.copy()
   
    if 'Jean-Marie Gustave' in books_for_cycle.loc[index, 'Book-Title']:
        autor = 'Jean-Marie Gustave Le Clézio'  # J. M. G. Le Clézio
        new_title = books_for_cycle.loc[index, 'Book-Title'].split(';')[0].strip(r'\'"\\').replace("\\", "").replace("'", "").replace('"', "")
          
        # label indexing
        books.loc[index, 'Book-Title'] = new_title
        books.loc[index, 'Book-Author'] = autor
        
    else:
        new_title = books_for_cycle.loc[index, 'Book-Title'].split(';')[0].replace('\\', '').replace('"', '').replace("'", '')
        autor = books_for_cycle.loc[index, 'Book-Title'].split(';')[-1].replace('"', '').strip()
                
        # label indexing
        books.loc[index, 'Book-Title'] = new_title
        books.loc[index, 'Book-Author'] = autor


indexes
books.loc[[209538, 220731, 221678]]


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",Michael Teitelbaum,2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...
220731,2070426769,"Peuple du ciel, suivi de Les Bergers",Jean-Marie Gustave Le Clézio,2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",James Buckley,2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...


In [89]:
books.info()

# Zjistit, kde jsou chybějící hodnoty ve sloupci Book-Author, Publisher
missing_authors = books.loc[books['Book-Author'].isnull()]
missing_authors

missing_publishers = books.loc[books['Publisher'].isnull()]
missing_publishers

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271360 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...


In [90]:
# B) Remove NaN values of the ratings
books.loc[books['Publisher'].isnull(), 'Publisher'] = 'Unknown'
books.loc[books['Book-Author'].isnull(), 'Book-Author'] = 'Unknown'


print(books.info(), ratings.info())
# nan_count = books[['Year-Of-Publication', 'Book-Author', 'Publisher', 'Image-URL-L']].isna().sum()
# nan_count # no NaN values left

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271360 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271360 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271360 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 433671 entries, 1 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      433671 non-null  int64 
 1   ISBN         433671 non-null  object
 2   Book-Rating  433671 non-null  int64 
dtypes: int64(2), object(1)
memor

In [91]:
ratings.loc[ratings['User-ID'] < 0, 'User-ID']

Series([], Name: User-ID, dtype: int64)

In [92]:
# C) Merge datasets of the ratings, books

# print(Fore.BLUE + Back.GREEN + f'IMPROVE 2) acquisition of the ratings from users' + Fore.RESET)
# users_ratigs = pd.merge(ratings, books, on=['User-ID'])

dataset = pd.merge(ratings, books, on=['ISBN'])


In [93]:
# D) Remove duplicate rows
dataset.duplicated().sum()
ratings.loc[ratings['User-ID'] < 0, 'User-ID']

Series([], Name: User-ID, dtype: int64)

In [94]:
# E) Convert data types

column_types = {
    'User-ID': 'int64',
    'ISBN': 'string',
    'Book-Rating': 'int64',
    'Book-Title': 'string',
    'Book-Author': 'string',
    'Year-Of-Publication': 'int64',  # nebo 'string', pokud jsou hodnoty smíšené
    'Publisher': 'string',
    'Image-URL-S': 'string',
    'Image-URL-M': 'string',
    'Image-URL-L': 'string'
}

# Data types conversion
for column, dtype in column_types.items():
    dataset[column] = dataset[column].astype(dtype)

Series([], Name: User-ID, dtype: int64)

In [95]:
dataset.info()
dataset_lowercase=dataset.apply(lambda x: x.str.lower() if(x.dtype == 'string') else x)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383842 entries, 0 to 383841
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   User-ID              383842 non-null  int64 
 1   ISBN                 383842 non-null  string
 2   Book-Rating          383842 non-null  int64 
 3   Book-Title           383842 non-null  string
 4   Book-Author          383842 non-null  string
 5   Year-Of-Publication  383842 non-null  int64 
 6   Publisher            383842 non-null  string
 7   Image-URL-S          383842 non-null  string
 8   Image-URL-M          383842 non-null  string
 9   Image-URL-L          383842 non-null  string
dtypes: int64(3), string(7)
memory usage: 29.3 MB


In [None]:
# zůžení výběru na TOLKIEN readery a knihu
tolkien_readers = dataset_lowercase.loc[((dataset_lowercase['Book-Title']=='the fellowship of the ring (the lord of the rings, part 1)') & dataset_lowercase['Book-Author'].str.contains("tolkien")), 'User-ID']
tolkien_readers = tolkien_readers.tolist()
tolkien_readers = np.unique(tolkien_readers)
tolkien_readers



array([   254,   1674,  11676,  11944,  13191,  16601,  16777,  16795,
        17725,  22818,  23188,  23571,  23699,  23872,  25123,  25981,
        26057,  30711,  33816,  35108,  36562,  36907,  39467,  43642,
        44755,  44924,  45340,  46690,  48355,  49225,  50615,  51373,
        52350,  54898,  56623,  57398,  60244,  63360,  67198,  67995,
        74411,  74792,  74844,  79310,  79475,  80071,  81492,  81560,
        81848,  81982,  82899,  83496,  86423,  86451,  87555,  87938,
        88320,  88733,  88937,  92693,  92951,  96744,  98783, 100004,
       102702, 104880, 105221, 105476, 109135, 110265, 110962, 111847,
       113380, 114355, 115536, 115572, 116123, 116904, 119858, 124060,
       124876, 125774, 126388, 128946, 131154, 133957, 135458, 136354,
       136382, 136491, 137118, 137742, 138543, 138995, 141693, 142149,
       142623, 142672, 143856, 144114, 148028, 149198, 149398, 149483,
       150894, 150896, 151847, 153662, 154811, 155495, 159506, 160434,
      

In [110]:
# final dataset
books_of_tolkien_readers = dataset_lowercase[(dataset_lowercase['User-ID'].isin(tolkien_readers))]
books_of_tolkien_readers

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1533,278246,0618129030,8,the fellowship of the ring (the lord of the ri...,j. r. r. tolkien,,houghton mifflin company,http://images.amazon.com/images/p/0618129030.0...,http://images.amazon.com/images/p/0618129030.0...,http://images.amazon.com/images/p/0618129030.0...
1843,278550,0028630343,7,"frommer's 2000 bahamas (frommer's bahamas, 2000)",arthur frommer,1999,"hungry minds, inc",http://images.amazon.com/images/p/0028630343.0...,http://images.amazon.com/images/p/0028630343.0...,http://images.amazon.com/images/p/0028630343.0...
1844,278550,0345339703,10,the fellowship of the ring (the lord of the ri...,j.r.r. tolkien,,del rey,http://images.amazon.com/images/p/0345339703.0...,http://images.amazon.com/images/p/0345339703.0...,http://images.amazon.com/images/p/0345339703.0...
1845,278550,0673397394,8,study and thinking skills in college,kathleen t mcwhorter,1988,"scott, foresman",http://images.amazon.com/images/p/0673397394.0...,http://images.amazon.com/images/p/0673397394.0...,http://images.amazon.com/images/p/0673397394.0...
2252,254,0060502320,7,"i've got you, babe",karen kendall,,avon,http://images.amazon.com/images/p/0060502320.0...,http://images.amazon.com/images/p/0060502320.0...,http://images.amazon.com/images/p/0060502320.0...
...,...,...,...,...,...,...,...,...,...,...
383332,276313,0449906736,5,flashpoints: promise and peril in a new world,robin wright,,ballantine books,http://images.amazon.com/images/p/0449906736.0...,http://images.amazon.com/images/p/0449906736.0...,http://images.amazon.com/images/p/0449906736.0...
383333,276313,0671027360,9,angels &amp; demons,dan brown,,pocket star,http://images.amazon.com/images/p/0671027360.0...,http://images.amazon.com/images/p/0671027360.0...,http://images.amazon.com/images/p/0671027360.0...
383334,276313,0688156568,5,culture jam: the uncooling of america,kalle lasn,,william morrow &amp; company,http://images.amazon.com/images/p/0688156568.0...,http://images.amazon.com/images/p/0688156568.0...,http://images.amazon.com/images/p/0688156568.0...
383335,276313,0812511816,9,"the eye of the world (the wheel of time, book 1)",robert jordan,,tor fantasy,http://images.amazon.com/images/p/0812511816.0...,http://images.amazon.com/images/p/0812511816.0...,http://images.amazon.com/images/p/0812511816.0...


In [100]:
#users_ratigs = pd.merge(ratings, users, on=['User-ID'])
dataset = pd.merge(ratings, books, on=['ISBN'])
dataset_lowercase=dataset.apply(lambda x: x.str.lower() if(x.dtype == 'object') else x)

tolkien_readers = dataset_lowercase['User-ID'][(dataset_lowercase['Book-Title']=='the fellowship of the ring (the lord of the rings, part 1)') & (dataset_lowercase['Book-Author'].str.contains("tolkien"))]
tolkien_readers = tolkien_readers.tolist()
tolkien_readers = np.unique(tolkien_readers)

In [101]:
# final dataset
books_of_tolkien_readers = dataset_lowercase[(dataset_lowercase['User-ID'].isin(tolkien_readers))]

In [143]:
# Number of ratings per other books in dataset
number_of_rating_per_book = books_of_tolkien_readers.groupby('Book-Title').agg('count')
number_of_rating_per_book.sort_values(by='Book-Rating', ascending=False)

# Average rating per book

# Seskupit podle názvu knihy a vypočítat průměrné hodnocení
avg_rating_per_book = books_of_tolkien_readers.groupby('Book-Title').agg(
    avg_rating=('Book-Rating', 'mean'),
    count_rating=('Book-Rating', 'count')).reset_index()
avg_rating_per_book.sort_values(by=['count_rating', 'avg_rating'], ascending=False)

print('IMPROVE 3) weighted average by the number of ratings and count of ratings')

Unnamed: 0,Book-Title,avg_rating,count_rating
13441,the fellowship of the ring (the lord of the ri...,8.882927,205
15616,"the two towers (the lord of the rings, part 2)",9.573333,75
14995,"the return of the king (the lord of the rings,...",9.410959,73
13813,the hobbit : the enchanting prelude to the lor...,9.052632,38
5770,harry potter and the chamber of secrets (book 2),8.972973,37
...,...,...,...
17312,"winning battle (temptation, no 236)",1.000000,1
17322,winter of fire (point),1.000000,1
17448,wormwood: a collection of short stories,1.000000,1
17510,years of city,1.000000,1


In [103]:
#select only books which have actually higher number of ratings than threshold
books_to_compare = number_of_rating_per_book['Book-Title'][number_of_rating_per_book['User-ID'] >= 8]
books_to_compare = books_to_compare.tolist()

ratings_data_raw = books_of_tolkien_readers[['User-ID', 'Book-Rating', 'Book-Title']][books_of_tolkien_readers['Book-Title'].isin(books_to_compare)]

In [104]:
# group by User and Book and compute mean
ratings_data_raw_nodup = ratings_data_raw.groupby(['User-ID', 'Book-Title'])['Book-Rating'].mean()

# reset index to see User-ID in every row
ratings_data_raw_nodup = ratings_data_raw_nodup.to_frame().reset_index()

dataset_for_corr = ratings_data_raw_nodup.pivot(index='User-ID', columns='Book-Title', values='Book-Rating')

LoR_list = ['the fellowship of the ring (the lord of the rings, part 1)']

result_list = []
worst_list = []


In [105]:
# for each of the trilogy book compute:
for LoR_book in LoR_list:
    
    #Take out the Lord of the Rings selected book from correlation dataframe
    dataset_of_other_books = dataset_for_corr.copy(deep=False)
    dataset_of_other_books.drop([LoR_book], axis=1, inplace=True)
      
    # empty lists
    book_titles = []
    correlations = []
    avgrating = []

    # corr computation
    for book_title in list(dataset_of_other_books.columns.values):
        book_titles.append(book_title)
        correlations.append(dataset_for_corr[LoR_book].corr(dataset_of_other_books[book_title]))
        tab=(ratings_data_raw[ratings_data_raw['Book-Title']==book_title].groupby(ratings_data_raw['Book-Title']).mean())
        avgrating.append(tab['Book-Rating'].min())
    # final dataframe of all correlation of each book   
    corr_fellowship = pd.DataFrame(list(zip(book_titles, correlations, avgrating)), columns=['book','corr','avg_rating'])
    corr_fellowship.head()

    # top 10 books with highest corr
    result_list.append(corr_fellowship.sort_values('corr', ascending = False).head(10))
    
    #worst 10 books
    worst_list.append(corr_fellowship.sort_values('corr', ascending = False).tail(10))
    
print("Correlation for book:", LoR_list[0])
#print("Average rating of LOR:", ratings_data_raw[ratings_data_raw['Book-Title']=='the fellowship of the ring (the lord of the rings, part 1'].groupby(ratings_data_raw['Book-Title']).mean()))
rslt = result_list[0]

TypeError: agg function failed [how->mean,dtype->object]