In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# Read the books and the ratings data
books = pd.read_csv("./Resources/Books.csv", low_memory = False)
ratings = pd.read_csv("./Resources/Ratings.csv", low_memory = False)

#Combine both into a single DataFrame
book_ratings = books.merge(ratings, how = "outer", on = "ISBN")

# Display data for preview
book_ratings.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,,
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0


In [3]:
book_ratings.dtypes

ISBN                    object
Book-Title              object
Book-Author             object
Year-Of-Publication     object
Publisher               object
User-ID                float64
Book-Rating            float64
dtype: object

In [4]:
book_ratings.count()

ISBN                   555195
Book-Title             505366
Book-Author            505365
Year-Of-Publication    505366
Publisher              505364
User-ID                433671
Book-Rating            433671
dtype: int64

In [5]:
book_ratings = book_ratings.dropna(how = "any")

In [6]:
book_ratings.count()

ISBN                   383839
Book-Title             383839
Book-Author            383839
Year-Of-Publication    383839
Publisher              383839
User-ID                383839
Book-Rating            383839
dtype: int64

In [7]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8.0,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676.0,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544.0,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866.0,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629.0,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318.0,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970.0,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313.0,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463.0,7.0


In [8]:
book_ratings["User-ID"] = pd.to_numeric(book_ratings["User-ID"], errors='coerce')

In [9]:
book_ratings["User-ID"] = book_ratings["User-ID"].astype(int)

In [10]:
book_ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,User-ID,Book-Rating
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,8,5.0
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,11676,8.0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,67544,8.0
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,116866,9.0
5,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,123629,9.0
...,...,...,...,...,...,...,...
505352,0395264707,Dreamsnake,Vonda N. McIntyre,1978,Houghton Mifflin,275318,10.0
505358,1845170423,Cocktail Classics,David Biggs,2004,Connaught,275970,7.0
505360,0449906736,Flashpoints: Promise and Peril in a New World,Robin Wright,1993,Ballantine Books,276313,5.0
505361,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),276463,7.0


In [11]:
book_ratings = book_ratings.reset_index()

In [None]:
# Assuming book_ratings is a DataFrame with a "Book-Title" column
title = book_ratings["Book-Title"]
counter = 0

def translate_to_english(title, translator):
    global counter
    try:
        # Translate the title to English
        translation = translator.translate(title, dest='en')
        counter += 1
        clear_output(wait=True)  # Clear the previous output
        display(f"Current count: {counter}")  # Display only the current count
        return translation.text
    except Exception as e:
        # Handle any potential errors during translation
#         print(f"Error translating '{title}': {str(e)}")
        return title

# Create a Translator object
translator = Translator()

# Apply translation to the 'Title' column
book_ratings['Translated Title'] = book_ratings["Book-Title"].apply(lambda x: translate_to_english(x, translator))

# Display the DataFrame with translated titles
print(book_ratings)


'Current count: 80627'

In [14]:
!pip install --upgrade transformers

Collecting sentencepiece (from transformers)
  Downloading sentencepiece-0.1.99-cp311-cp311-win_amd64.whl (977 kB)
     ---------------------------------------- 0.0/977.5 kB ? eta -:--:--
     --------- ---------------------------- 256.0/977.5 kB 7.9 MB/s eta 0:00:01
     ----------------------------- -------- 768.0/977.5 kB 9.7 MB/s eta 0:00:01
     -------------------------------------- 977.5/977.5 kB 8.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [16]:
from transformers import AutoModelForConditionalGeneration, AutoTokenizer

model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForConditionalGeneration.from_pretrained(model_name)

ImportError: cannot import name 'AutoModelForConditionalGeneration' from 'transformers' (C:\Users\Seeke\anaconda3\Lib\site-packages\transformers\__init__.py)

In [15]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm.auto import tqdm

# Initialize the model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Assuming book_ratings is a DataFrame with a "Book-Title" column
titles = book_ratings["Book-Title"].tolist()

def translate_titles(titles, model, tokenizer, batch_size=10):
    translated_texts = []
    for i in tqdm(range(0, len(titles), batch_size)):
        batch = titles[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids
        outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
        translated_batch = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        translated_texts.extend(translated_batch)
    return translated_texts

# Translate titles
translated_titles = translate_titles(titles, model, tokenizer)

# Update DataFrame
book_ratings['Translated Title'] = translated_titles

# Display or save your DataFrame
print(book_ratings.head())





ImportError: cannot import name 'T5ForConditionalGeneration' from 'transformers' (C:\Users\Seeke\anaconda3\Lib\site-packages\transformers\__init__.py)

In [None]:
book_ratings.nunique()

In [None]:
# Use Label Encoding from scikit-learn to encode ISBNs which are categorical
from sklearn.preprocessing import LabelEncoder

isbn_encoder = LabelEncoder()
book_ratings["ISBN_encoded"] = isbn_encoder.fit_transform(book_ratings["ISBN"])

# Reverse the encoding - we will use this to reverse the encoding later if needed
# book_ratings['ISBN_decoded'] = isbn_encoder.inverse_transform(book_ratings['ISBN_encoded'])

In [None]:
book_ratings.dtypes

In [None]:
display(book_ratings.loc(["ISBN_encoded"] == "074322678X"))


In [None]:
book_ratings.to_csv("Output/book_ratings.csv", index = False)

In [None]:
n_users = book_ratings["User-ID"].unique().shape[0]
n_items = book_ratings["ISBN_encoded"].unique().shape[0]
n_items = book_ratings["ISBN_encoded"].max()
A = np.zeros((n_users, n_items))

for line in dataset.itertuples():
    A[line[1] - 1, line[2] - 1] = line[3]


In [None]:
non_integer_rows = book_ratings[~book_ratings['ISBN_encoded'].apply(lambda x: isinstance(x, int))]

In [None]:
non_integer_rows