# Book text preprocessing (Book-Title, Book-Author, Book-Publisher)

## Set up environment

In [17]:
import os
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# English stop words from NLTK, held as a set for membership tests
stop_words = {*stopwords.words('english')}

# Single WordNet lemmatizer instance shared by the preprocessing code
lemmatizer = WordNetLemmatizer()

# Vectorizers and cosine similarity, for vectorizing the book info column (TF-IDF)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\andyd\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andyd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andyd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andyd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# Raw data directory: <parent of the current working directory>/data/raw/
# Built from components so separators are consistent on every OS; the final
# empty component keeps a trailing separator, which later cells rely on when
# they do `path + '<file name>'`.
path = os.path.join(
    os.path.normpath(os.path.join(os.getcwd(), os.pardir)),
    "data",
    "raw",
    "",
)

In [19]:
# Despite its name, this is the normalized *parent* of the current working directory.
cwd = os.path.normpath(os.getcwd() + os.sep + os.pardir)

In [20]:
# List the raw data files. os.listdir returns entries in arbitrary order,
# so sort them to keep the displayed output reproducible across runs.
sorted(os.listdir(path))

['BX-Books.csv',
 'BX-NewBooks.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooksUsers.csv',
 'BX-Ratings.csv',
 'BX-Users.csv']

In [21]:
# Load BX-Books.csv from the raw data directory
books = pd.read_csv(os.path.join(path, 'BX-Books.csv'))

In [22]:
def preprocess_text(text):
    """Normalize a piece of free text into a space-separated token string.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter or digit with a space, tokenize with NLTK, drop the
    tokens found in ``stop_words``, lemmatize the remaining tokens with
    ``lemmatizer``, and join them with single spaces.

    Parameters
    ----------
    text : str or missing value
        The raw text. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as empty text; any other non-string value is converted
        with ``str()`` first.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for missing
        input.
    """
    # Missing cells have no `.lower()`; map them to an empty string instead
    # of raising, and coerce other non-string values to text.
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Replace special characters with spaces. Letters AND digits are kept
    # (e.g. a year such as "1918" inside a title survives this step).
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the processed text as a string
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    """Clean one text column of ``df`` by applying ``preprocess_text`` to each value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place: the column is
        overwritten with its processed values.
    column_name : hashable
        Label of the column to process.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, returned for convenience.
    """
    # Write back under the exact label that was read. Formatting the label
    # into a string first would, for a non-string label, create a new
    # stringified column instead of overwriting the original one.
    df[column_name] = df[column_name].apply(preprocess_text)
    return df

In [23]:
# Clean each free-text column in turn; the columns are overwritten with their processed form
for text_column in ("Book-Title", "Book-Publisher", "Book-Author"):
    books = preprocess_dataframe(books, text_column)

In [24]:
# Inspect the frame after the text columns have been preprocessed
books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher
0,0002005018,clara callan,richard bruce wright,2001,harperflamingo canada
1,0374157065,flu story great influenza pandemic 1918 search...,gina bari kolata,1999,farrar straus giroux
2,0399135782,kitchen god wife,amy tan,1991,putnam pub group
3,0440234743,testament,john grisham,1999,dell
4,0452264464,beloved plume contemporary fiction,toni morrison,1994,plume
...,...,...,...,...,...
18180,0375411615,love etc,julian barnes,2001,alfred knopf
18181,0836227751,wit whimsy mary engelbreit,mary engelbreit,1997,andrew mcmeel publishing
18182,8433966634,los detective salvajes,roberto bolano,2003,anagrama
18183,0330353349,ice house tv tie edition,minette walter,1997,mcclelland stewart


In [None]:
# Save the preprocessed frame as CSV, without writing the index.
# NOTE(review): the bare file name is relative, so the file is written to the
# current working directory rather than the data folder — confirm this is intended.
books.to_csv("BX-Cleaned-Books.csv", index=False)