# Handling Category
---
변수별로 threshold 정한 기준:  
Clustering의 개수를 정할 때 사용하는 elbow method에서 착안.  
카디널리티가 급격하게 줄어드는 값으로 threshold 설정

In [2]:
import pandas as pd
import numpy as np

In [12]:
books = pd.read_csv('data/books_1.0.csv')
users = pd.read_csv('data/users_f_location_1.1.csv')
ratings = pd.read_csv('data/train_ratings.csv')

## user 카테고리 매핑
---

user의 `country` 카디널리티 줄이기

In [175]:
user_country_vc = users.location_country.value_counts()
etc_country = user_country_vc.where(user_country_vc < 30).dropna().index
many_country = user_country_vc.where(user_country_vc >= 30).dropna().index

v = len(users.location_country.unique()) - len(etc_country)

country2etc = {country:v+1 for country in etc_country}
country2many = {country:idx for idx, country in enumerate(many_country)}
country2etc.update(country2many)

users.location_country = users.location_country.map(country2etc)


`state`, `city`는 카디널리티가 너무 높으므로 변수 drop

In [176]:
users = users.drop(['location_city', 'location_state'], axis=1)

나이 연령대로 매핑

In [177]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [178]:
# Null 값은 평균으로 대치
users = users.fillna(users.age.mean())
users.age = users.age.apply(age_map)

In [179]:
users

Unnamed: 0,user_id,age,location_country
0,8,3,1.000000
1,11400,4,1.000000
2,67544,3,1.000000
3,85526,3,1.000000
4,96054,2,1.000000
...,...,...,...
68087,156948,3,36.069873
68088,169489,3,36.069873
68089,56072,3,36.069873
68090,64582,3,36.069873


## books 데이터 카테고리 매핑

카디널리티가 너무 높은 변수 drop

In [180]:
books = books.drop(['book_title', 'summary', 'img_path', 'img_url'], axis=1)

book_author 변수 정규화 

In [181]:
import re
books.book_author = books['book_author'].str.strip().str.lower().apply(lambda x: re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", x))

작가별 집필한 책 수

박경리: 15 권쯤  
박완서: 40 권쯤  
신경숙: 15 권쯤  
베르나르 베르베르: 15권쯤 

In [182]:
book_author_vc = books.book_author.value_counts()
etc_author = book_author_vc.where(book_author_vc < 15).dropna().index
many_author = book_author_vc.where(book_author_vc >= 15).dropna().index

v = len(books.book_author.unique()) - len(etc_author)

author2etc = {author:v+1 for author in etc_author}
author2many = {author:idx  for idx, author in enumerate(many_author)}
author2etc.update(author2many)

books.book_author = books.book_author.map(author2etc)

year_of_publication 변수 카테고리화  
1376~ 2006년 사이 출판

In [183]:
books.year_of_publication.value_counts().index.sort_values()

Float64Index([1376.0, 1378.0, 1806.0, 1900.0, 1901.0, 1902.0, 1904.0, 1906.0,
              1908.0, 1911.0, 1920.0, 1923.0, 1924.0, 1925.0, 1926.0, 1927.0,
              1928.0, 1929.0, 1930.0, 1931.0, 1932.0, 1933.0, 1934.0, 1935.0,
              1936.0, 1937.0, 1938.0, 1939.0, 1940.0, 1941.0, 1942.0, 1943.0,
              1944.0, 1945.0, 1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0,
              1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0,
              1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0,
              1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0,
              1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0,
              1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0,
              1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0,
              2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0],
             dtype='float64')

In [184]:
def publish_map(x: int) -> int:
    x = int(x)
    if x < 1940:
        return 1
    elif x >= 1940 and x < 1950:
        return 2
    elif x >= 1950 and x < 1960:
        return 3
    elif x >= 1960 and x < 1970:
        return 4
    elif x >= 1970 and x < 1980:
        return 5
    elif x >= 1980 and x < 1990:
        return 6
    elif x >= 1990 and x < 2000:
        return 7
    else:
        return 8

In [185]:
books.year_of_publication = books.year_of_publication.apply(publish_map)

### publisher 처리

In [186]:
books_publisher_vc = books.publisher.value_counts()
etc_publisher = books_publisher_vc.where(books_publisher_vc < 20).dropna().index
many_publisher = books_publisher_vc.where(books_publisher_vc >= 20).dropna().index

v = len(books.publisher.unique()) - len(etc_publisher)
publisher2etc = {publisher:v+1 for publisher in etc_publisher}
publisher2many = {publisher:idx for idx, publisher in enumerate(many_publisher)}
publisher2etc.update(publisher2many)

books.publisher = books.publisher.map(publisher2etc)

### category 처리

In [198]:
books_category_vc = books.category.value_counts()
etc_category = books_category_vc.where(books_category_vc < 10).dropna().index
many_category = books_category_vc.where(books_category_vc >= 10).dropna().index

v = len(books.category.unique()) - len(etc_category)
category2etc =     {category:v+1 for category in etc_category}
category2many =    {category:idx for idx, category in enumerate(many_category)}
category2etc.update(category2many)

books.category = books.category.map(category2etc)
books = books.fillna(-1)

### language 처리

In [219]:
books_language_vc =  books.language.value_counts()
etc_lang = books_language_vc.where(books_language_vc < 100).dropna().index
many_lang = books_language_vc.where(books_language_vc >= 100).dropna().index

v = len(books.language.unique()) - len(etc_lang)
lang2etc =     {lang:v+1 for lang in etc_lang}
lang2many =    {lang:idx for idx, lang in enumerate(many_lang)}
lang2etc.update(lang2many)

books.language = books.language.map(lang2etc)

## 저장

In [223]:
users.to_csv('cate_users.csv', index=None)
books.to_csv('cate_books.csv', index=None)

# 전체 카테고리 변수에 대해 일괄적인 threshold 값 적용 exp

In [None]:
def get_core(x):
        if x in l:
            return
        else:
            return 'others'

In [None]:
THRESHOLD = 120

books_cate_col = ['category', 'language','publisher','book_author']
for col in books_cate_col:

    l = list(books[col].value_counts().where(books[col].value_counts() >= THRESHOLD).dropna().index)
    books[col] = books[col].apply(get_core)

In [None]:
users_cate_col = ['location_country']
for col in users_cate_col:
    l = list(users[col].value_counts().where(users[col].value_counts() >= THRESHOLD).index)
    users[col] = users[col].apply(get_core)

# 결론

FM으로 성능을 실험한 결과 일괄적인 Threshold를 적용했을 때 더 좋은 성능을 보여줬다.