In [59]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from fuzzywuzzy import process, fuzz

In [60]:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor

filename = "preprocessing_title.ipynb"
with open(filename) as ff:
    nb_in = nbformat.read(ff, nbformat.NO_CONVERT)
    
ep = ExecutePreprocessor(timeout=600, kernel_name='python3')

nb_out = ep.preprocess(nb_in)

In [61]:
def fuzzy_match_comparison(col):
    
    unique = books[f"Book-{col}"].unique()
    print(len(unique))
    
    score_sort = [(x,) + i
             for x in unique
             for i in process.extract(x, unique, scorer=fuzz.ratio)] 
    
    similarity_sort = pd.DataFrame(score_sort, columns=[f'{col}_sort','match_sort','score_sort'])
    similarity_sort[f'sorted_{col}_sort'] = np.minimum(similarity_sort[f'{col}_sort'], similarity_sort['match_sort'])

    high_score_sort = \
    similarity_sort[(similarity_sort['score_sort'] >= 80) & \
                    (similarity_sort[f'{col}_sort'] !=  similarity_sort['match_sort']) & \
                    (similarity_sort[f'sorted_{col}_sort'] != similarity_sort['match_sort'])] 
    high_score_sort = high_score_sort.drop(f'sorted_{col}_sort',axis=1).copy()

    high_score_sort = high_score_sort.groupby([f'{col}_sort','score_sort']).agg(
                        {'match_sort': ', '.join}).sort_values(
                        ['score_sort'], ascending=False).reset_index()

    return high_score_sort

In [62]:
books = pd.read_csv('BX-Cleaned-Books.csv')

In [63]:
# Define regex pattern to remove non-alphanumeric characters except '&'
pattern = r'[^a-zA-Z0-9& ]'

# Function to clean text based on regex pattern
def clean_text(text):
    cleaned_text = re.sub(pattern, ' ', text)
    return cleaned_text

books["Book-Author-Processed"] = books["Book-Author"].str.lower()
books["Book-Author-Processed"] = books["Book-Author-Processed"].str.replace("'", "")
books["Book-Author-Processed"] = books["Book-Author-Processed"].apply(clean_text).str.replace('  ', ' ')




In [64]:
author_fuzzy_match = fuzzy_match_comparison("Author-Processed")


5925


In [65]:
author_fuzzy_match

Unnamed: 0,Author-Processed_sort,score_sort,match_sort
0,jean christophe grang,98,jean christophe grange
1,gabriel garacia marquez,98,gabriel garcia marquez
2,friedrich duerenmatt,98,friedrich duerrenmatt
3,zalata filipovic,97,zlata filipovic
4,laura ingall wilder,97,laura ingalls wilder
...,...,...,...
1001,james dos,80,james jones
1002,patricia conner,80,"patricia foster, patricia potter"
1003,david farland,80,david kaplan
1004,patricia c wrede,80,patricia hedge


In [66]:
cutoff = 92
author_fuzzy_dict = author_fuzzy_match.loc[author_fuzzy_match["score_sort"]  >= cutoff]
author_fuzzy_dict = author_fuzzy_dict.set_index("Author-Processed_sort")
author_fuzzy_dict = author_fuzzy_dict["match_sort"]
author_fuzzy_dict.to_dict()

{'jean christophe grang': 'jean christophe grange',
 'gabriel garacia marquez': 'gabriel garcia marquez',
 'friedrich duerenmatt': 'friedrich duerrenmatt',
 'zalata filipovic': 'zlata filipovic',
 'laura ingall wilder': 'laura ingalls wilder',
 'mariann fredriksson': 'marianne fredriksson',
 'fyodor dostoevsky': 'fyodor dostoyevsky',
 'barbara de angelis': 'barbara deangelis',
 'frederic beigbeder': 'frederick beigbeder',
 'f scott fitzgerald': 'f scott fritzgerald',
 'vonda n mcintryre': 'vonda n mcintyre',
 'elizabet coatsworth': 'elizabeth coatsworth',
 'berhard schlink': 'bernhard schlink',
 'beverley donofrio': 'beverly donofrio',
 'patricia mc killip': 'patricia mckillip',
 'william shakespeare': 'william shakspeare',
 'philip k howard': 'phillip k howard',
 'diana wynne jones': 'dianna wynne jones',
 'ursula k le guin': 'ursula k leguin',
 'niccol machiavelli': 'niccolo machiavelli',
 'stephen frey': 'stephen w frey',
 'carol matthew': 'carole matthew',
 'paul coelho': 'paulo co

In [67]:
books["Book-Author-Processed"] = books["Book-Author-Processed"].replace(author_fuzzy_dict)

unique = books[f"Book-Author-Processed"].unique()
print(len(unique))

5803


In [68]:
books["Book-Publisher-Processed"] = books["Book-Publisher"].str.lower()
books["Book-Publisher-Processed"] = books["Book-Publisher-Processed"].str.replace("'", "")
books["Book-Publisher-Processed"] = books["Book-Publisher-Processed"].apply(clean_text).str.replace('  ', ' ')

In [69]:
publisher_fuzzy_match = fuzzy_match_comparison("Publisher-Processed")
publisher_fuzzy_match

1299


Unnamed: 0,Publisher-Processed_sort,score_sort,match_sort
0,harper collins publisher,98,harpercollins publisher
1,ullstein buchverlage gmbh co kg ullstein ta,98,ullstein buchverlage gmbh co kg ullstein tasc
2,harper collins canada,98,harpercollins canada
3,harper sanfrancisco,97,harpersanfrancisco
4,harper collins uk,97,harpercollins uk
...,...,...,...
348,paperstar book,80,persea book
349,pan publishing,80,sport publishing
350,pan book,80,pan book ltd
351,st martin pr mm,80,st martin press


In [70]:
cutoff = 91
publisher_fuzzy_dict = publisher_fuzzy_match.loc[publisher_fuzzy_match["score_sort"]  >= cutoff]
publisher_fuzzy_dict = publisher_fuzzy_dict.set_index("Publisher-Processed_sort")
publisher_fuzzy_dict = publisher_fuzzy_dict["match_sort"]
publisher_fuzzy_dict.to_dict()

{'harper collins publisher': 'harpercollins publisher',
 'ullstein buchverlage gmbh co kg ullstein ta': 'ullstein buchverlage gmbh co kg ullstein tasc',
 'harper collins canada': 'harpercollins canada',
 'harper sanfrancisco': 'harpersanfrancisco',
 'harper collins uk': 'harpercollins uk',
 'harper san francisco': 'harpersanfrancisco',
 'da capo press': 'dacapo press',
 'berkeley book': 'berkley book',
 'schoenhof foreign book inc': 'schoenhofsforeign book inc',
 'plaza janes editor': 'plaza janes editores',
 'penguin u': 'penguin uk',
 'laure leaf': 'laurel leaf',
 'penguin puffin mass market': 'penguin puffin mass market mm',
 'random house childrens pub': 'random house childrens pub mm',
 'harper mass market paperback': 'harper mass market paperback mm',
 'tyndale house publisher': 'tyndale house putlishers',
 'new amer library classic': 'new amer library classic mm',
 'crown publishing group': 'orion publishing group',
 'deutscher taschenbuch verlag': 'deutscher taschenbuch verlag 

In [71]:
books["Book-Publisher-Processed"] = books["Book-Publisher-Processed"].replace(publisher_fuzzy_dict)

unique = books[f"Book-Publisher-Processed"].unique()
print(len(unique))

1260


In [72]:
books["Book-Publisher"] = books["Book-Publisher-Processed"]
books["Book-Author"] = books["Book-Author-Processed"]
books = books.drop(["Book-Publisher-Processed", "Book-Author-Processed"], axis = 1)

In [73]:
def year_verify(year):
    min_year = 1000
    max_year = 2024
    return (year >= min_year) & (year <= max_year)

raw_year = books["Year-Of-Publication"]
real_year = raw_year.loc[year_verify(raw_year)]
general_median_year = real_year.median()


In [74]:
def median_group(group, index):
    book_group = books[[group, "Year-Of-Publication"]]
    book_group = book_group.loc[book_group[group] == book_group[group][index]]
    median_year = book_group["Year-Of-Publication"]
    median_year = median_year.loc[year_verify(median_year)]
    median_year = median_year.median()
    return median_year

from collections import defaultdict as dd
imputation_method = dd(int)

def imputate_year(index):
    year = books["Year-Of-Publication"][index]
    if (year_verify(year)):
        return year

    groups = ["Book-Title", "Book-Author", "Book-Publisher"]
    for group in groups:
        year = median_group(group, index)
        if (year_verify(year)):
            imputation_method[group] += 1
            return year
        
    imputation_method["Book-All Data"] += 1
    return general_median_year 

books["Processed-Year"] = pd.Series(books.index).apply(imputate_year)
for key in ["Title", "Author", "Publisher", "All Data"]: 
    print(f'Number of years imputated by grouping {key}: {imputation_method["Book-"+key]}')

Number of years imputated by grouping Title: 106
Number of years imputated by grouping Author: 131
Number of years imputated by grouping Publisher: 63
Number of years imputated by grouping All Data: 17


In [75]:
books["Year-Of-Publication"] = books["Processed-Year"]
books = books.drop(["Processed-Year"], axis = 1)

In [76]:
books.to_csv("BX-Cleaned-Books.csv", index=False)