# Book-Title preprocessing

## Set up environment

In [19]:
import os
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Initialize the stop words
stop_words = set(stopwords.words('english'))

# vectorizing the book info column using TFidf Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/raw/")

In [12]:
cwd = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir))

In [13]:
os.listdir(path)

['BX-NewBooksUsers.csv',
 'BX-Books.csv',
 'BX-Ratings.csv',
 'BX-Users.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooks.csv']

In [14]:
books = pd.read_csv(path + 'BX-Books.csv')

In [15]:
def preprocess_text(text):

    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Return the processed text as a string
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    df[f"{column_name}-Processed"] = df[column_name].apply(preprocess_text)
    return df

In [52]:
books = preprocess_dataframe(books, "Book-Title")
books = preprocess_dataframe(books, "Book-Publisher")
books = preprocess_dataframe(books, "Book-Author")

In [53]:
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher,Book-Title-Processed,Book-Publisher-Processed,Book-Author-Processed
0,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,clara callan,harperflamingo canada,richard bruce wright
1,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,flu story great influenza pandemic 1918 search...,farrar straus giroux,gina bari kolata
2,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,kitchen god wife,putnam pub group,amy tan
3,0440234743,The Testament,John Grisham,1999,Dell,testament,dell,john grisham
4,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,beloved plume contemporary fiction,plume,toni morrison
...,...,...,...,...,...,...,...,...
18180,0375411615,"Love, Etc.",Julian Barnes,2001,Alfred A. Knopf,love etc,alfred knopf,julian barnes
18181,0836227751,The Wit And Whimsy Of Mary Engelbreit,Mary Engelbreit,1997,Andrews McMeel Publishing,wit whimsy mary engelbreit,andrew mcmeel publishing,mary engelbreit
18182,8433966634,Los Detectives Salvajes,Roberto Bolano,2003,Anagrama,los detective salvajes,anagrama,roberto bolano
18183,0330353349,The Ice House (TV Tie-In Edition),Minette Walters,1997,McClelland & Stewart,ice house tv tie edition,mcclelland stewart,minette walter


In [65]:
books["info"] = books["Book-Title-Processed"] + ' ' + books["Book-Author-Processed"] + ' ' \
     + books["Year-Of-Publication"].astype(str)

In [66]:
tf = TfidfVectorizer(analyzer = "word", ngram_range=(1,2), min_df=0, stop_words='english')

In [67]:
tfidf_matrix = tf.fit_transform(books['info'])

In [68]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [69]:
indices = pd.Series(books['Book-Title'])

In [70]:
def recommend(title, cosine_sim = cosine_sim):
    if title not in indices.values:
        return "Title not found in the database."
    recommended_books = []
    idx = indices[indices == title].index[0]   # to get the index of book name matching the input book_name
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   # similarity scores in descending order
    top_10_indices = list(score_series.iloc[1:11].index)   # to get the indices of top 10 most similar books
    # [1:11] to exclude 0 (index 0 is the input book itself)
    
    for i, x in zip(top_10_indices, score_series.iloc[1:11]):   # to append the titles of top 10 similar booksto the recommended_books list
        recommended_books.append([list(books['info'])[i], round(x, 2)])
        
    return recommended_books

In [73]:
recommend("Love, Etc.")

[['talking julian barnes 1992', 0.45],
 ['staring sun julian barnes 0', 0.42],
 ['ex appeal zoe barnes 2001', 0.3],
 ['history world 10 1 2 chapter vintage international julian barnes 1990',
  0.3],
 ['surveillance julian may 1988', 0.16],
 ['hitched z barnes 1998', 0.15],
 ['king albion julian rathbone 2001', 0.14],
 ['many colored land julian may 1994', 0.14],
 ['coyote linda barnes 1991', 0.13],
 ['last english king julian rathbone 1998', 0.13]]

In [77]:
example = tf.fit_transform(["hello"])

In [78]:
cosine_similarity(example, tfidf_matrix)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 1 while Y.shape[1] == 76675

In [80]:
user = pd.read_csv(path + "BX-Ratings.csv")

In [81]:
user

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276744,038550120X,7
1,276754,0684867621,8
2,276755,0451166892,5
3,276762,0380711524,5
4,276772,0553572369,7
...,...,...,...
204159,276688,0892966548,10
204160,276688,1551669315,6
204161,276704,0345386108,6
204162,276704,0743211383,7


In [None]:
user.pivot_table(columns='User-ID', index='ISBN', values="Book-Rating")