In [10]:
import pandas as pd
import numpy as np
import scipy.stats as ss
from tqdm import tqdm
import re

# Load books.csv into a DataFrame; the path is relative to the current working directory.
books = pd.read_csv("../main_code/data/books.csv", encoding='utf-8')

def text_preprocessing_func(text: str) -> str:
    """Repair known mojibake and reduce the text to lowercase letters, digits and spaces.

    Steps, in order:
      1. Replace known broken encodings of 'é' with a plain 'e'. This runs
         before lowercasing because the patterns contain uppercase characters.
      2. Lowercase the text.
      3. Fold a few accented letters to 'a', turn non-breaking spaces and
         newlines into regular spaces, and resolve the HTML entities for the
         apostrophe and the double quote.
      4. Drop every character that is not a-z, a digit, or a space.
      5. Strip leading and trailing spaces.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Mojibake spellings of 'é'; approximated with a plain 'e'.
    for broken in ('Ã?Â©️', 'Ã©️', 'Ã?Â?'):
        text = text.replace(broken, 'e')

    text = text.lower()

    # Applied in this exact order after lowercasing.
    substitutions = (
        ('ã', 'a'),
        ('\xa0', ' '),
        ('â', 'a'),
        ('\n', ' '),
        ('&#39;', "'"),
        ('&quot;', ''),
        ('à', 'a'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Hyphens, apostrophes and symbols such as '²', '½' or 'µ' need no separate
    # handling: this filter removes everything outside a-z, digits and spaces.
    text = re.sub(r'[^a-z\d ]', '', text)
    return text.strip()

def year_map(x: int) -> int:
    """Map a publication year to an era bucket (1 = newest, 6 = oldest).

    Buckets:
        1: 2000 and later
        2: 1993-1999
        3: 1989-1992
        4: 1980-1988
        5: 1970-1979
        6: before 1970

    Args:
        x: Publication year; any value accepted by ``int()``.

    Returns:
        The bucket number, from 1 to 6.
    """
    year = int(x)
    # 2000 itself belongs to the newest bucket; with a strict `>` it matched
    # no range at all and fell through to bucket 6.
    if year >= 2000:
        return 1
    if year >= 1993:
        return 2
    if year >= 1989:
        return 3
    if year >= 1980:
        return 4
    if year >= 1970:
        return 5
    return 6

# Store the publication year as an integer column.
books['year_of_publication'] = books['year_of_publication'].astype(int)

# Normalize category labels: collapse every run of non-alphanumeric characters
# (brackets, quotes, underscores, ...) into a single space, trim, then lowercase.
# The .str accessor leaves missing values as NaN, so no explicit null mask is needed.
books['category'] = (
    books['category']
    .str.replace(r'[\W_]+', ' ', regex=True)
    .str.strip()
    .str.lower()
)

# Seed the coarse label from the cleaned category; rows without a category
# are labelled 'Unclassified'.
books['category_high'] = books['category'].fillna('Unclassified')

# Coarse group name -> keywords looked up as substrings of the cleaned category.
# Groups are applied in the order written here, so when a category matches
# keywords from several groups, the group listed last wins. Very broad keywords
# therefore sit near the top so that more specific groups can override them.
groupings = {'Fiction': ['fiction', 'ficti'], # too broad, so it goes at the very top
             'Literature & Poem': ['liter', 'poem', 'poetry', 'novel', 'sonnet'],
             'Science & Math': ['science', 'math', 'logy', 'chemis', 'physics', 'electron'], # 'science' and 'logy' match very broadly, so keep this near the top
             'Parenting & Relationships': ['baby', 'babies', 'parent', 'family', 'friend', 'tionship', 'brother', 'sister', 'families', 'friendship', 'mother', 'father'], # fairly large group
             'Medical Books': ['medi', 'psycho'], # the 'psy' part could be subdivided further
             'Adventure' : ['adventu'],
             'Animal & Nature': ['animal', 'ecolo', 'plant', 'nature', 'cat', 'dog', 'pets', 'bird', 'bear', 'horse', 'frog', 'duck', 'rabbit', 'dolphin', 'mice', 'deer',
             'panda', 'kangaroo', 'lizard', 'gorilla', 'chimpanzee', 'bat', 'insect'],
             'Arts & Photography': ['art', 'photo', 'drawing', 'picture'], # 'art' overlaps with too many other words
             'Authors' : ['authors'],
             'Biographies & Memoirs': ['biog', 'memo'],
             'Business & Money': ['busi', 'money', 'econo', 'finance'],
             'Calendars': ['calen'],
             'Children\'s Books': ['child', 'baby'],
             'Christian Books & Bibles': ['christi', 'bible'], # 'christi' rather than 'christ' because of christmas
             'Christmas' : ['christma'],
             'Comics & Graphic Novels': ['comics', 'graphic novel'],
             'Computers & Technology': ['computer', 'techno', 'archi'],
             'Cookbooks, Food & Wine': ['cook', 'wine', 'food'],
             'Countries & Cities' : ['united states', 'russia', 'france', 'africa', 'china', 'japan', 'egypt', 'germany', 'ireland', 'california', 'berlin', 'london', 'new york', 'canada',
             'chile', 'italy', 'europe', 'australia', 'great britain', 'arizona', 'chicago', 'netherlands', 'calif', 'mexico', 'colombia', 'greece',
             'florida', 'algeria', 'new zealand', 'austria', 'denmark', 'washington', 'india', 'england', 'brazil'],
             'Crafts, Hobbies & Home': ['crafts', 'hobb', 'home', 'house', 'garden'],
             'Crime & Murder' : ['crime', 'murder', 'criminal', 'homicide', 'mafia', 'gang', 'drug'],
             'Critic' : ['critic'],
             'Education & Teaching': ['educa', 'teach'],
             'Drama' : ['drama'],
             'Design' : ['design'],
             'Engineering & Transportation': ['engine', 'transp'],
             'Encyclopedia & Dictionary' : ['encyclo', 'dictiona', 'vocabulary'],
             'Essay' : ['essay'],
             'Health, Fitness & Dieting': ['health', 'fitness', 'diet'],
             'History': ['histo', 'war'],
             # NOTE(review): 'comic' also matches 'comics', so this group overrides
             # 'Comics & Graphic Novels' for those categories - confirm this is intended.
             'Humor & Entertainment': ['humor', 'entertai', 'comed', 'game', 'comic'],
             'Law': ['law'],
             'Language' : ['language'],
             'LGBTQ+ Books': ['lesbian', 'gay', 'bisex'],
             'Mystery, Thriller & Suspense': ['myste', 'thril', 'suspen', 'horror', 'occult'],
             'Music & Dance' : ['music', 'dance', 'instrument', 'ballet', 'classic'],
             'Movie' : ['motion pictur', 'actor', 'actres', 'acting', 'cinema', 'theater', 'director', 'television'],
             'Politics': ['politic', 'president'],
             'Philosophy' : ['philoso'],
             'Reference': ['reference'],
             'Religion & Spirituality': ['religi', 'buddh', 'spirit', 'god', 'prayer', 'belief', 'doubt'],
             'Romance': ['romance'],
             'Science Fiction & Fantasy': ['imagin', 'science fiction', 'fantasy', 'fairy', 'fairies', 'vampire', 'epidemic', 'ghost', 'alien', 'supernatural', 'magic', 'dragons', 'elves', 'angel', 'devil'],
             'Short story' : ['short'],
             'Social Science' : ['social', 'ethic', 'communism', 'capitalism', 'generation', 'culture'],
             'Self-Help': ['self'], # every match found when searching for 'self' was self-help related
             'Study': ['test', 'school', 'examina', 'study aids', 'college'],
             'Sports & Outdoors': ['exerc','sport','outdoor', 'baseball', 'soccer', 'hockey', 'cricket', 'basketball', 'footbal'],
             'Teen & Young Adult': ['teen', 'adol', 'juven'], # the word 'nonfiction' only showed up in juvenile-related categories
             'Travel': ['travel'],
             'Women' : ['women'],
             }

# Collapse the fragmented categories into the coarse groups. Groups are applied
# in dict order, so a later group overwrites an earlier one on the same row.
for new_group, small in groupings.items():
    for s in small:
        # Keywords are plain substrings, so match them literally (regex=False);
        # na=False treats rows with a missing category as non-matching.
        matches = books['category'].str.contains(s, regex=False, na=False)
        books.loc[matches, 'category_high'] = new_group