# Preprocessing BX-Books.csv
## Set up environment

In [1]:
import os
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import process, fuzz

import nltk
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Initialize the stop words
stop_words = set(stopwords.words('english'))

# vectorizing the book info column using TFidf Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import OrdinalEncoder

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/caseyhaseloff/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/")

os.listdir(path + 'raw/')

['BX-NewBooksUsers.csv',
 'BX-Books.csv',
 'BX-Ratings.csv',
 'BX-Users.csv',
 'BX-NewBooksRatings.csv',
 'BX-NewBooks.csv']

In [3]:
def fuzzy_match_comparison(col):
    
    unique = books[f"Book-{col}"].unique()
    print(len(unique))
    
    score_sort = [(x,) + i
             for x in unique
             for i in process.extract(x, unique, scorer=fuzz.ratio)] 
    
    similarity_sort = pd.DataFrame(score_sort, columns=[f'{col}_sort','match_sort','score_sort'])
    similarity_sort[f'sorted_{col}_sort'] = np.minimum(similarity_sort[f'{col}_sort'], similarity_sort['match_sort'])

    high_score_sort = \
    similarity_sort[(similarity_sort['score_sort'] >= 80) & \
                    (similarity_sort[f'{col}_sort'] !=  similarity_sort['match_sort']) & \
                    (similarity_sort[f'sorted_{col}_sort'] != similarity_sort['match_sort'])] 
    high_score_sort = high_score_sort.drop(f'sorted_{col}_sort',axis=1).copy()

    high_score_sort = high_score_sort.sort_values(
                        ['score_sort'], ascending=False).reset_index()

    return high_score_sort

In [4]:
books = pd.read_csv(path + 'raw/' + 'BX-Books.csv')

### Clean Book Author:

In [5]:
# Define regex pattern to remove non-alphanumeric characters except '&'
pattern = r'[^a-zA-Z0-9& ]'

# Function to clean text based on regex pattern
def clean_text(text):
    cleaned_text = re.sub(pattern, ' ', text)
    return cleaned_text

books["Book-Author-Processed"] = books["Book-Author"].str.lower()
books["Book-Author-Processed"] = books["Book-Author-Processed"].str.replace("'", "")
books["Book-Author-Processed"] = books["Book-Author-Processed"].apply(clean_text).str.replace('  ', ' ').str.strip()

In [6]:
%%time
author_fuzzy_match = fuzzy_match_comparison("Author-Processed")

5960
CPU times: user 52.8 s, sys: 1.28 s, total: 54.1 s
Wall time: 56.9 s


In [7]:
author_fuzzy_match

Unnamed: 0,index,Author-Processed_sort,match_sort,score_sort
0,10836,friedrich duerenmatt,friedrich duerrenmatt,98
1,20936,jean christophe grang,jean christophe grange,98
2,28551,bathroom reader institute,bathroom readers institute,98
3,29551,gabriel garacia marquez,gabriel garcia marquez,98
4,18006,judith michael,judith michaels,97
...,...,...,...,...
1015,21766,garth ennis,garth nix,80
1016,21666,joe connelly,john connolly,80
1017,21506,d j conway,deanna j conway,80
1018,8691,harry harrison,kathryn harrison,80


In [8]:
cutoff = 92
author_fuzzy_dict = author_fuzzy_match.loc[author_fuzzy_match["score_sort"]  >= cutoff]
author_fuzzy_dict = author_fuzzy_dict.set_index("Author-Processed_sort")
author_fuzzy_dict = author_fuzzy_dict["match_sort"]
author_fuzzy_dict.to_dict()

{'friedrich duerenmatt': 'friedrich duerrenmatt',
 'jean christophe grang': 'jean christophe grange',
 'bathroom reader institute': 'bathroom readers institute',
 'gabriel garacia marquez': 'gabriel garcia marquez',
 'judith michael': 'judith michaels',
 'fyodor dostoevsky': 'fyodor dostoyevsky',
 'zalata filipovic': 'zlata filipovic',
 'frederic beigbeder': 'frederick beigbeder',
 'philip k howard': 'phillip k howard',
 'beverley donofrio': 'beverly donofrio',
 'william shakespeare': 'william shakspeare',
 'f scott fitzgerald': 'f scott fritzgerald',
 'vonda n mcintryre': 'vonda n mcintyre',
 'ursula k le guin': 'ursula k leguin',
 'laura ingall wilder': 'laura ingalls wilder',
 'carol matthews': 'carole matthews',
 'mariann fredriksson': 'marianne fredriksson',
 'berhard schlink': 'bernhard schlink',
 'elizabet coatsworth': 'elizabeth coatsworth',
 'diana wynne jones': 'dianna wynne jones',
 'barbara de angelis': 'barbara deangelis',
 'daniel de foe': 'daniel defoe',
 'john le carre'

In [9]:
books["Book-Author-Processed"] = books["Book-Author-Processed"].replace(author_fuzzy_dict)

unique = books["Book-Author-Processed"].unique()
print(len(unique))

5813


### Clean Book Publisher:

In [10]:
books["Book-Publisher-Processed"] = books["Book-Publisher"].str.lower()
books["Book-Publisher-Processed"] = books["Book-Publisher-Processed"].str.replace("'", "")
books["Book-Publisher-Processed"] = books["Book-Publisher-Processed"].apply(clean_text).str.replace('  ', ' ').str.strip()

In [11]:
%%time
publisher_fuzzy_match = fuzzy_match_comparison("Publisher-Processed")
publisher_fuzzy_match

1319
CPU times: user 2.76 s, sys: 37.2 ms, total: 2.8 s
Wall time: 2.87 s


Unnamed: 0,index,Publisher-Processed_sort,match_sort,score_sort
0,3091,ullstein buchverlage gmbh & co kg ullstein tas,ullstein buchverlage gmbh & co kg ullstein tasc,99
1,1151,droemersche verlagsanstalt th knaur nachf gmb...,droemersche verlagsanstalt th knaur nachf gmbh...,99
2,266,harpercollins publisher,harpercollins publishers,98
3,4906,harper collins canada,harpercollins canada,98
4,3796,harper collins publishers,harpercollins publishers,98
...,...,...,...,...
427,5062,haynes publications,hysteria publications,80
428,429,knopf books for young readers,random house books for young readers,80
429,5023,alyson publications,arc publications,80
430,1298,pan books ltd,penguin books ltd,80


In [12]:
cutoff = 91
publisher_fuzzy_dict = publisher_fuzzy_match.loc[publisher_fuzzy_match["score_sort"]  >= cutoff]
publisher_fuzzy_dict = publisher_fuzzy_dict.set_index("Publisher-Processed_sort")
publisher_fuzzy_dict = publisher_fuzzy_dict["match_sort"]
publisher_fuzzy_dict.to_dict()

{'ullstein buchverlage gmbh & co kg  ullstein tas': 'ullstein buchverlage gmbh & co kg  ullstein tasc',
 'droemersche verlagsanstalt th knaur nachf  gmbh & co': 'droemersche verlagsanstalt th knaur nachf gmbh & co',
 'harpercollins publisher': 'harpercollins publishers',
 'harper collins canada': 'harpercollins canada',
 'harper collins publishers': 'harpercollins publisher',
 'harper san francisco': 'harpersanfrancisco',
 'harper sanfrancisco': 'harpersanfrancisco',
 'berkeley books': 'berkley books',
 'schoenhof foreign books inc': 'schoenhofsforeign books inc',
 'signet book': 'signet books',
 'tyndale house publishers': 'tyndale house putlishers',
 'da capo press': 'dacapo press',
 'laure leaf': 'laurel leaf',
 'harper mass market paperbacks': 'harper mass market paperbacks mm',
 'regan books': 'reganbooks',
 'random house childrens pub': 'random house childrens pub mm',
 'penguin puffin mass market': 'penguin puffin mass market mm',
 'little brown & company': 'little brown and com

In [13]:
books["Book-Publisher-Processed"] = books["Book-Publisher-Processed"].replace(publisher_fuzzy_dict)

unique = books["Book-Publisher-Processed"].unique()
print(len(unique))

1272


### Clean Book Title:

In [14]:
def preprocess_text(text):

    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # going to keep stop words in since there are titles with only stop words and everything in title is important
    # Remove stop words
    # tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Return the processed text as a string
    return " ".join(tokens)

def preprocess_dataframe(df, column_name):
    df[f"{column_name}-Processed"] = df[column_name].apply(preprocess_text)
    return df

In [15]:
books = preprocess_dataframe(books, "Book-Title")

### TF-IDF for Book-Title:
Vectorise the book table so we can we can compare

In [16]:
books['Book-Info'] = books['Book-Title-Processed'] + ' by ' + books['Book-Author-Processed'] 

In [17]:
tf = TfidfVectorizer(analyzer = "word", ngram_range=(1,2), min_df=0, stop_words='english')

In [18]:
tfidf_matrix = tf.fit(books['Book-Info'])

In [19]:
tfidf_vectors = tfidf_matrix.transform(books['Book-Info']).toarray()

In [20]:
books['Book-Vector'] = list(tfidf_vectors)

### Impute missing years:

In [21]:
def year_verify(year):
    min_year = 1000
    max_year = 2024
    return (year >= min_year) & (year <= max_year)

raw_year = books["Year-Of-Publication"]
real_year = raw_year.loc[year_verify(raw_year)]
general_median_year = real_year.median()

In [22]:
def median_group(group, index):
    book_group = books[[group, "Year-Of-Publication"]]
    book_group = book_group.loc[book_group[group] == book_group[group][index]]
    median_year = book_group["Year-Of-Publication"]
    median_year = median_year.loc[year_verify(median_year)]
    median_year = median_year.median()
    return median_year

from collections import defaultdict as dd
imputation_method = dd(int)

def imputate_year(index):
    year = books["Year-Of-Publication"][index]
    if (year_verify(year)):
        return year

    groups = ["Book-Title-Processed", "Book-Author-Processed", "Book-Publisher-Processed"]
    for group in groups:
        year = median_group(group, index)
        if (year_verify(year)):
            imputation_method[group] += 1
            return year
        
    imputation_method["Book-All Data"] += 1
    return general_median_year 

books["Year-Of-Publication-Processed"] = pd.Series(books.index).apply(imputate_year)
for key in ["Title-Processed", "Author-Processed", "Publisher-Processed", "All Data"]: 
    print(f'Number of years imputated by grouping {key}: {imputation_method["Book-"+key]}')

Number of years imputated by grouping Title-Processed: 81
Number of years imputated by grouping Author-Processed: 153
Number of years imputated by grouping Publisher-Processed: 64
Number of years imputated by grouping All Data: 19


In [23]:
books["Year-Of-Publication"] = books["Year-Of-Publication-Processed"]
books = books.drop(["Year-Of-Publication-Processed"], axis = 1)

### Discretize Years into Decades:
Reasoning for this is that types of users could like books from a certain decade.

In [24]:
decade_list = [1919 + 10 * x for x in range(10)]

In [25]:
books['Year-Of-Publication-Group'] = pd.cut(x=books['Year-Of-Publication'], bins=decade_list)

In [26]:
encoder = OrdinalEncoder()
encoded_publish_year = encoder.fit_transform(books[['Year-Of-Publication-Group']])
books[['Year-Of-Publication-Group-Encoded']] = encoded_publish_year.astype(int)

In [27]:
books.head(1)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher,Book-Author-Processed,Book-Publisher-Processed,Book-Title-Processed,Book-Info,Book-Vector,Year-Of-Publication-Group,Year-Of-Publication-Group-Encoded
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,richard bruce wright,harperflamingo canada,clara callan,clara callan by richard bruce wright,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1999, 2009]",8


In [28]:
books.to_csv(path + 'cleaned/' + "BX-Books.csv", index=False)