In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# Read the books and the ratings data
books = pd.read_csv("./Resources/Books.csv", low_memory = False)
ratings = pd.read_csv("./Resources/Ratings.csv", low_memory = False)

#Combine both into a single DataFrame
book_ratings = books.merge(ratings, how = "outer", on = "ISBN")

# Display data for preview
book_ratings.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,,
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0


In [3]:
book_ratings.dtypes

ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
User-ID                float64
Book-Rating            float64
dtype: object

In [4]:
book_ratings.count()

ISBN                   555195
Book-Title             505366
Book-Author            505365
Year-Of-Publication    505366
Publisher              505364
User-ID                433671
Book-Rating            433671
dtype: int64

In [5]:
book_ratings = book_ratings.dropna(how = "any")

In [6]:
book_ratings.count()

ISBN                   383839
Book-Title             383839
Book-Author            383839
Year-Of-Publication    383839
Publisher              383839
User-ID                383839
Book-Rating            383839
dtype: int64

In [7]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629.0,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318.0,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970.0,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313.0,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463.0,7.0


In [8]:
book_ratings["User-ID"] = pd.to_numeric(book_ratings["User-ID"], errors='coerce')

In [9]:
book_ratings["User-ID"] = book_ratings["User-ID"].astype(int)

In [10]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,7.0


In [15]:
# Install translator
# !pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
     ---------------------------------------- 0.0/55.1 kB ? eta -:--:--
     ---------------------------------------- 55.1/55.1 kB 1.4 MB/s eta 0:00:00
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Obtaining dependency information for hstspreload from https://files.pythonhosted.org/packages/b5/9f/83329ebd2808e04f2564051e4c4a880a1e2e67bd6410899f728096d0e22f/hstspreload-2024.2.1-py3-none-any.whl.metadata
  Downloading hstspreload-2024.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
     ---------------------------------------- 0.0/133.4 kB ? eta -:--:--
     --

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 2.1.1 requires sentencepiece, which is not installed.


In [19]:
from googletrans import Translator

title = book_ratings["Book-Title"]

def translate_to_english(title, translator):
    try:
        # Translate the title to English
        translation = translator.translate(title, dest='en')
        return translation.text
    except Exception as e:
        # Handle any potential errors during translation
#         print(f"Error translating '{title}': {str(e)}")
        return title

# Create a Translator object
translator = Translator()

# Apply translation to the 'Title' column
book_ratings['Translated Title'] = book_ratings["Book-Title"].apply(lambda x: translate_to_english(x, translator))

# Display the DataFrame with translated titles
print(book_ratings)

              ISBN                                     Book-Title  \
1       0002005018                                   Clara Callan   
2       0002005018                                   Clara Callan   
3       0002005018                                   Clara Callan   
4       0002005018                                   Clara Callan   
5       0002005018                                   Clara Callan   
...            ...                                            ...   
505352  0395264707                                     Dreamsnake   
505358  1845170423                              Cocktail Classics   
505360  0449906736  Flashpoints: Promise and Peril in a New World   
505361  0440400988                     There's a Bat in Bunk Five   
505362  0525447644                        From One to One Hundred   

                 Book-Author Year-Of-Publication  \
1       Richard Bruce Wright                2001   
2       Richard Bruce Wright                2001   
3       Richard

In [20]:
book_ratings = book_ratings.reset_index()

In [12]:
book_ratings.nunique()

index                  383839
ISBN                   149833
Book-Title             135565
Book-Author             62112
Year-Of-Publication       106
Publisher               11574
User-ID                 68091
Book-Rating                10
dtype: int64

In [21]:
# Update the datatype for 'Year-Of-Publication' field to numeric
book_ratings['Year-Of-Publication'] = pd.to_numeric(book_ratings['Year-Of-Publication'], errors='coerce')

In [22]:
# remove duplicated ISBN records, if any
book_ratings = book_ratings.drop_duplicates(subset=['ISBN'])

In [23]:
book_ratings.count()

index                  149833
ISBN                   149833
Book-Title             149833
Book-Author            149833
Year-Of-Publication    149832
Publisher              149833
User-ID                149833
Book-Rating            149833
Translated Title       149833
dtype: int64

In [24]:
book_ratings = book_ratings.dropna(how = "any")

In [25]:
book_ratings.count()

index                  149832
ISBN                   149832
Book-Title             149832
Book-Author            149832
Year-Of-Publication    149832
Publisher              149832
User-ID                149832
Book-Rating            149832
Translated Title       149832
dtype: int64

In [26]:
rows_with_zero_value = book_ratings[book_ratings['Year-Of-Publication'] == 0]
rows_with_zero_value.head()

Unnamed: 0,index,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating,Translated Title
4982,5000,3150000335,Kabale Und Liebe,Schiller,0.0,"Philipp Reclam, Jun Verlag GmbH",242,10.0,Kabale Und Liebe
5017,5035,342311360X,Die Liebe in Den Zelten,Gabriel Garcia Marquez,0.0,Deutscher Taschenbuch Verlag (DTV),242,10.0,The love in the tents
7785,7815,0571197639,Poisonwood Bible Edition Uk,Barbara Kingsolver,0.0,Faber Faber Inc,11676,6.0,Poisonwood Bible Edition Uk
8625,8659,3596214629,"Herr Der Fliegen (Fiction, Poetry and Drama)",Golding,0.0,Fischer Taschenbuch Verlag GmbH,276994,8.0,"Lord of the flies (fiction, poetry and drama)"
13298,13368,8845229041,Biblioteca Universale Rizzoli: Sulla Sponda De...,P Coelho,0.0,Fabbri - RCS Libri,460,3.0,Rizzoli Universal Library: on the bank of the ...
...,...,...,...,...,...,...,...,...,...
383409,504325,0713715200,American Filmmakers Today,Dian Smith,0.0,Sterling*+ Publishing Company,243223,5.0,American Filmmakers Today
383429,504379,0553026828,Dissertation,R M Koster,0.0,Bantam Doubleday Dell,243490,8.0,Dissertation
383441,504402,0517209802,Christmas Memories With Recipes,Compilation,0.0,Random House,243700,5.0,Christmas Memories With Recipes
383479,504489,0571204163,Headlong,Michael Frayn,0.0,Faber Faber Inc,243942,8.0,Headlong


In [27]:
book_ratings = book_ratings[book_ratings['Year-Of-Publication'] != 0]

In [29]:
book_ratings = book_ratings.drop(book_ratings[book_ratings['Year-Of-Publication'] == 1378].index)
book_ratings = book_ratings.drop(book_ratings[book_ratings['Year-Of-Publication'] == 1376].index)

In [30]:
book_ratings['Year-Of-Publication'] = pd.to_datetime(book_ratings['Year-Of-Publication'], format='%Y')

In [31]:
book_ratings.dtypes

index                           int64
ISBN                           object
Book-Title                     object
Book-Author                    object
Year-Of-Publication    datetime64[ns]
Publisher                      object
User-ID                         int32
Book-Rating                   float64
Translated Title               object
dtype: object

In [32]:
book_ratings.to_csv("Output/clean_translated_book_ratings.csv", index = False)