# MissForest 로 결측치 처리하기


**MissForest**
- 개념
    - Random Forest 알고리즘으로 missing value 예측하는 것
- 장점
    - 어떤 feature가 중요한지 random forest를 통해 알 수 있음
    - tuning 필요 없음 (KNN의 K 결정 문제 해결)
    - categorical data type 도 가능
- Refrerence
    - [MissForest: The Best Missing Data Imputation Algorithm?](https://towardsdatascience.com/missforest-the-best-missing-data-imputation-algorithm-4d01182aed3)
    - [How to Use Python and MissForest Algorithm to Impute Missing Data](https://towardsdatascience.com/how-to-use-python-and-missforest-algorithm-to-impute-missing-data-ed45eb47cb9a)


In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

In [None]:
# users_v2, books_v3 사용하여 차원 줄여보기 
base_path = os.path.join(os.curdir, 'data')
userv2_path = os.path.join(base_path, 'users_v2.csv') # age를 random 추출
bookv3_path = os.path.join(base_path, 'books_v3.csv') # category를 줄인 상태 
rating_path = os.path.join(base_path, 'train_ratings.csv')

In [None]:
# 파일 불러와서 차원 파악
usersv2 = pd.read_csv(userv2_path, encoding='utf-8')
bookv3 = pd.read_csv(bookv3_path, encoding='utf-8')
ratings = pd.read_csv(rating_path, encoding='utf-8')

merge_ = ratings.merge(bookv3, how='left', on='isbn')
data = merge_.merge(usersv2, how='inner', on='user_id')
data.nunique()

In [None]:
# bookv3을 data_amp로 copy
data_amp = data.copy()

In [None]:
# top 10 출판사 data만 추출
top10_publisher = data_amp['publisher'].value_counts()[:10].index.tolist()
data_amp = data_amp[data_amp['publisher'].isin(top10_publisher)]
data_amp.nunique()

In [None]:
# Top 100 작가 data만 추출
top100_author = data_amp['book_author'].value_counts()[:100].index.tolist()
data_amp = data_amp[data_amp['book_author'].isin(top100_author)]
data_amp.nunique()

In [None]:
# category 추정에 publisher, book_author 만을 고려 
data_amp = data_amp[['rating', 'category', 'publisher', 'book_author']]
data_amp.dropna(inplace=True) # 결측치가 있으면 제외 
data_amp.reset_index(inplace=True)

In [None]:
# 필요한 부분만 추출한 data_amp 확인
data_amp.head()

In [None]:
# categorical data 처리를 위해 LabelEncoder 사용
from sklearn.preprocessing import LabelEncoder

In [None]:
# publisher와 book_author label encoding 후 데이터 대치 
encoder = LabelEncoder()
data_amp['category'] = encoder.fit_transform(data_amp['category'])
data_amp['publisher'] = encoder.fit_transform(data_amp['publisher'])
data_amp['book_author'] = encoder.fit_transform(data_amp['book_author'])

In [None]:
data_amp['category'].value_counts()

In [None]:
# error 파악을 위해 기존 데이터 복사
data_orig = data_amp.copy()
data_orig.info()

In [None]:
# 랜덤하게 index를 뽑아 NaN 으로 만들기 
idx = list(set(np.random.randint(0, len(data_amp), 100)))
data_amp['category'] = [val if i not in idx else np.nan for i, val in enumerate(data_amp['category'])]
data_amp.isna().sum()

In [None]:
imputer = MissForest()
X = data_amp.drop('rating', axis=1)
X_imputed = imputer.fit_transform(X, cat_vars=[1,2,3])
X_imputed

In [None]:
# original 데이터와 비교 
data_orig['imputed_category'] = X_imputed[:, 1]
comparison_df = data_orig[['category', 'imputed_category']]

In [None]:
comparison_df.iloc[idx]

## 성능 평가

어떻게 하면 좋을지 고민중

In [None]:
# fbeta score 확인
from sklearn.metrics import fbeta_score
comparison_df['fbeta_score'] = fbeta_score(comparison_df['category'], comparison_df['imputed_category'], beta=0.5)

## 주의님 데이터로 실험하기
- 사용 데이터
    - books_topic_modeling_v2.csv
    - summary clustering으로 뽑아낸 summary_topic
    - category clustering으로 뽑아낸 category_topic

In [96]:
topic_path = os.path.join(base_path, 'books_topic_modeling_v2.csv')
topic_book = pd.read_csv(topic_path, encoding='utf-8')
merge_ = ratings.merge(topic_book, how='left', on='isbn')
data_topic = merge_.merge(usersv2, how='inner', on='user_id')
data_topic.nunique()

user_id                 59803
isbn                   129777
rating                     10
book_title             117729
book_author             54716
year_of_publication        92
publisher                1408
img_url                129777
language                   24
category                 3715
summary                 69779
img_path               129777
summary_topic             350
summary_topic_name        350
category_topic            112
category_topic_name       112
location                13888
age                        91
dtype: int64

In [97]:
data_topic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306795 entries, 0 to 306794
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              306795 non-null  int64  
 1   isbn                 306795 non-null  object 
 2   rating               306795 non-null  int64  
 3   book_title           306795 non-null  object 
 4   book_author          306795 non-null  object 
 5   year_of_publication  306795 non-null  int64  
 6   publisher            306795 non-null  object 
 7   img_url              306795 non-null  object 
 8   language             187711 non-null  object 
 9   category             185574 non-null  object 
 10  summary              187709 non-null  object 
 11  img_path             306795 non-null  object 
 12  summary_topic        187711 non-null  float64
 13  summary_topic_name   187711 non-null  object 
 14  category_topic       185574 non-null  float64
 15  category_topic_na

### 실험 1. category_topic만 보기 

In [98]:
data_topic_amp = data_topic.copy()

In [99]:
# top 20 출판사 data만 추출
top20_publisher = data_topic_amp['publisher'].value_counts()[:20].index.tolist()
data_topic_amp = data_topic_amp[data_topic_amp['publisher'].isin(top20_publisher)]
data_topic_amp.nunique()

user_id                39827
isbn                   53538
rating                    10
book_title             48253
book_author            19024
year_of_publication       76
publisher                 20
img_url                53538
language                  10
category                1490
summary                31546
img_path               53538
summary_topic            342
summary_topic_name       342
category_topic           112
category_topic_name      112
location                9695
age                       86
dtype: int64

In [100]:
# Top 1000 작가 data만 추출
top1000_author = data_topic_amp['book_author'].value_counts()[:1000].index.tolist()
data_topic_amp = data_topic_amp[data_topic_amp['book_author'].isin(top1000_author)]
data_topic_amp.nunique()

user_id                39897
isbn                   30043
rating                    10
book_title             24556
book_author             1000
year_of_publication       80
publisher                564
img_url                30043
language                  11
category                 986
summary                15174
img_path               30043
summary_topic            303
summary_topic_name       303
category_topic           112
category_topic_name      112
location               10247
age                       89
dtype: int64

In [101]:
# Top 500 category data만 추출
top500_category = data_topic_amp['category'].value_counts()[:500].index.tolist()
data_topic_amp = data_topic_amp[data_topic_amp['category'].isin(top500_category)]
data_topic_amp.nunique()

user_id                29425
isbn                   15306
rating                    10
book_title             13505
book_author              996
year_of_publication       73
publisher                387
img_url                15306
language                  10
category                 500
summary                14355
img_path               15306
summary_topic            301
summary_topic_name       301
category_topic           101
category_topic_name      101
location                8192
age                       88
dtype: int64

In [102]:
# category 추정에 publisher, book_author 만을 고려 
data_topic_amp = data_topic_amp[['rating', 'category_topic', 'category', 'publisher', 'book_author']]
data_topic_amp.dropna(inplace=True) # 결측치가 있으면 제외 
data_topic_amp.reset_index(inplace=True)

In [103]:
data_topic_amp.head(10)

Unnamed: 0,index,rating,category_topic,category,publisher,book_author
0,11,7,33.0,fiction,Dell Publishing Company,Belva Plain
1,18,10,33.0,fiction,Dell Publishing Company,DIANA GABALDON
2,23,6,-1.0,brooklyn,Bantam Books,William Styron
3,24,6,33.0,fiction,St. Martin's Press,Gail Tsukiyama
4,25,7,33.0,fiction,Riverhead Books,Sarah Waters
5,27,6,33.0,fiction,Ivy Books,Anna Quindlen
6,31,7,91.0,cities and towns,Warner Books,Billie Letts
7,36,5,33.0,fiction,Pocket,Larry McMurtry
8,48,10,33.0,fiction,Warner Books,Harper Lee
9,49,9,33.0,fiction,Perennial,Gabriel Garcia Marquez


In [104]:
encoder2 = LabelEncoder()
data_topic_amp['category'] = encoder2.fit_transform(data_topic_amp['category'])
data_topic_amp['publisher'] = encoder2.fit_transform(data_topic_amp['publisher'])
data_topic_amp['book_author'] = encoder2.fit_transform(data_topic_amp['book_author'])

In [105]:
data_topic_amp['category_topic'].value_counts()

 33.0     71285
-1.0       9416
 67.0      2479
 40.0      1557
 2.0        537
          ...  
 96.0         3
 59.0         3
 105.0        3
 98.0         3
 57.0         3
Name: category_topic, Length: 101, dtype: int64

In [106]:
# error 파악을 위해 기존 데이터 복사
data_topic_orig = data_topic_amp.copy()
data_topic_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94423 entries, 0 to 94422
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           94423 non-null  int64  
 1   rating          94423 non-null  int64  
 2   category_topic  94423 non-null  float64
 3   category        94423 non-null  int64  
 4   publisher       94423 non-null  int64  
 5   book_author     94423 non-null  int64  
dtypes: float64(1), int64(5)
memory usage: 4.3 MB


In [108]:
# 랜덤하게 index를 뽑아 NaN 으로 만들기 
idx = list(set(np.random.randint(0, len(data_topic_amp), 1000)))
data_topic_amp['category_topic'] = [val if i not in idx else np.nan for i, val in enumerate(data_topic_amp['category_topic'])]
data_topic_amp.isna().sum()

index                0
rating               0
category_topic    1095
category             0
publisher            0
book_author          0
dtype: int64

In [110]:
data_topic_amp.drop('rating', axis=1)

Unnamed: 0,index,category_topic,category,publisher,book_author
0,11,33.0,329,80,93
1,18,33.0,329,80,189
2,23,-1.0,150,28,988
3,24,33.0,329,322,319
4,25,33.0,329,281,841
...,...,...,...,...,...
94418,306702,33.0,329,272,54
94419,306705,33.0,329,364,284
94420,306725,33.0,329,251,579
94421,306735,2.0,469,385,938


In [None]:
imputer2 = MissForest()
X = data_topic_amp.drop('rating', axis=1)
X_imputed = imputer2.fit_transform(X, cat_vars=[1,2,3,4])
X_imputed

In [112]:
data_topic_orig['imputed_topic'] = X_imputed[:, 1]
comparison_topic_df = data_topic_orig[['category_topic', 'imputed_topic']]

In [115]:
# 틀린 건 2개 뿐
new_df = comparison_topic_df.iloc[idx]
new_df[new_df['category_topic'] != new_df['imputed_topic']]

Unnamed: 0,category_topic,imputed_topic
54738,-1.0,4.0
94191,31.0,-1.0


In [121]:
data_topic[data_topic['category_topic'] == -1]  # -1_darstellung_belletristische_geschichte_dutch	
data_topic[data_topic['category_topic'] == 4]   # 4_art_artists_painting_creative	
data_topic[data_topic['category_topic'] == 31]  # 31_canadian_canada_biography_québec	

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,summary_topic,summary_topic_name,category_topic,category_topic_name,location,age
2674,11676,0099740516,8,Fall on Your Knees,Ann-Marie MacDonald,1997,Trafalgar Square,http://images.amazon.com/images/P/0099740516.0...,en,canadian fiction,This is a story of family relationships racial...,images/0099740516.01.THUMBZZZ.jpg,-1.0,-1_39_quot_life_story,31.0,31_canadian_canada_biography_québec,",,",28.0
5069,11676,0771091931,9,The Edible Woman,MARGARET ATWOOD,1973,McClelland & Stewart,http://images.amazon.com/images/P/0771091931.0...,en,canadian fiction,What happens to someone who has been a willing...,images/0771091931.01.THUMBZZZ.jpg,16.0,16_business_marketing_leadership_management,31.0,31_canadian_canada_biography_québec,",,",28.0
13108,132930,1857027051,6,Larrys Party,Carol Shields,1994,Orion Publishing Co,http://images.amazon.com/images/P/1857027051.0...,en,domestic fiction canadian,The new novel from the Pulitzer Prize winning ...,images/1857027051.01.THUMBZZZ.jpg,345.0,345_pulitzer_prize_wapshot_middlesex,31.0,31_canadian_canada_biography_québec,"wokingham,england,unitedkingdom",29.0
19613,168047,0553247409,10,Anne of Avonlea (Anne of Green Gables Novels (...,Lucy Maud Montgomery,1984,Bantam Books,http://images.amazon.com/images/P/0553247409.0...,en,canada,Anne 39 s determination provides the same quot...,images/0553247409.01.THUMBZZZ.jpg,-1.0,-1_39_quot_life_story,31.0,31_canadian_canada_biography_québec,"oxford,england,unitedkingdom",28.0
21241,249862,0553269224,10,Rilla of Ingleside (Anne of Green Gables Novel...,L. M. Montgomery,1985,Bantam Books,http://images.amazon.com/images/P/0553269224.0...,en,canadian literature,The series begins as Anne an eleven year old o...,images/0553269224.01.THUMBZZZ.jpg,4.0,4_mother_sister_father_daughter,31.0,31_canadian_canada_biography_québec,"sugarland,texas,usa",36.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304687,263156,1852279761,9,Joni Mitchell: Shadows and Light the Definitiv...,Karen O'Brien,2003,Serpent's Tail,http://images.amazon.com/images/P/1852279761.0...,en,composers canada biography,This biography charts the life and loves of Jo...,images/1852279761.01.THUMBZZZ.jpg,-1.0,-1_39_quot_life_story,31.0,31_canadian_canada_biography_québec,"saultstemarie,ontario,canada",35.0
305127,267091,0006481078,7,A Year of Lesser: A Novel,David Bergen,1997,HarperCollins Publishers,http://images.amazon.com/images/P/0006481078.0...,en,canadian fiction,quot quot As time catches up with Johnny Lorai...,images/0006481078.01.THUMBZZZ.jpg,21.0,21_romance_love_romantic_passion,31.0,31_canadian_canada_biography_québec,"georgetown,sevenmilebeach,caymanislands",41.0
305411,213804,0688007562,4,The Canadian caper,Jean Pelletier,1981,Harpercollins,http://images.amazon.com/images/P/0688007562.0...,en,canada foreign relations iran,Presents an account of the six Americans who w...,images/0688007562.01.THUMBZZZ.jpg,-1.0,-1_39_quot_life_story,31.0,31_canadian_canada_biography_québec,"iola,kansas,usa",15.0
305444,215049,2760918068,6,"Quarante-quatre minutes, quarante-quatre secon...",Michel Tremblay,1997,Actes sud,http://images.amazon.com/images/P/2760918068.0...,fr,french canadian fiction,Trente ans plus tard alcoolique et revenu de t...,images/2760918068.01.THUMBZZZ.jpg,7.0,7_de_la_un_le,31.0,31_canadian_canada_biography_québec,"montreal,quebec,canada",46.0


### 실험 2. category_topic, summary_topic 같이 보기

In [131]:
data_summary_topic_amp = data_topic.copy()

In [127]:
# # 모든 조건을 만족시키면서 NaN 이 없는게 없는듯 -> 이 방법으로 실험하지 말자 
# data_summary_topic_amp = data_summary_topic_amp[data_summary_topic_amp['publisher'].isin(top20_publisher)]
# data_summary_topic_amp = data_summary_topic_amp[data_summary_topic_amp['book_author'].isin(top1000_author)]
# data_summary_topic_amp = data_summary_topic_amp[data_summary_topic_amp['category'].isin(top500_category)]

In [132]:
# category 추정에 publisher, book_author 만을 고려 
data_summary_topic_amp = data_summary_topic_amp[['rating', 'summary_topic', 'category_topic', 'category', 'publisher', 'book_author']]
data_summary_topic_amp.dropna(inplace=True) # 결측치가 있으면 제외 
data_summary_topic_amp.reset_index(inplace=True)

In [133]:
encoder3 = LabelEncoder()
data_summary_topic_amp['category'] = encoder3.fit_transform(data_summary_topic_amp['category'])
data_summary_topic_amp['publisher'] = encoder3.fit_transform(data_summary_topic_amp['publisher'])
data_summary_topic_amp['book_author'] = encoder3.fit_transform(data_summary_topic_amp['book_author'])

In [134]:
# error 파악을 위해 기존 데이터 복사
data_summary_topic_orig = data_summary_topic_amp.copy()
data_summary_topic_orig.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185574 entries, 0 to 185573
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   index           185574 non-null  int64  
 1   rating          185574 non-null  int64  
 2   summary_topic   185574 non-null  float64
 3   category_topic  185574 non-null  float64
 4   category        185574 non-null  int64  
 5   publisher       185574 non-null  int64  
 6   book_author     185574 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 9.9 MB


In [135]:
# 랜덤하게 index를 뽑아 NaN 으로 만들기 
idx = list(set(np.random.randint(0, len(data_summary_topic_amp), 10000)))
data_summary_topic_amp['category_topic'] = [val if i not in idx else np.nan for i, val in enumerate(data_summary_topic_amp['category_topic'])]
data_summary_topic_amp.isna().sum()

index                0
rating               0
summary_topic        0
category_topic    9716
category             0
publisher            0
book_author          0
dtype: int64

In [137]:
data_summary_topic_amp.drop('rating', axis=1).head()

Unnamed: 0,index,summary_topic,category_topic,category,publisher,book_author
0,0,-1.0,108.0,58,165,27331
1,1,-1.0,33.0,2089,570,1503
2,5,-1.0,63.0,2973,406,28004
3,6,-1.0,33.0,2089,363,17380
4,7,-1.0,108.0,58,165,27331


In [None]:
imputer3 = MissForest()
X = data_summary_topic_amp.drop('rating', axis=1)
X_imputed3 = imputer3.fit_transform(X, cat_vars=[1,2,3,4,5])
X_imputed3

In [140]:
data_summary_topic_orig['imputed_category_topic'] = X_imputed3[:, 1]
comparison_summary_topic_df = data_summary_topic_orig[['category_topic', 'imputed_category_topic']]

In [144]:
# test
new_df = comparison_summary_topic_df.iloc[idx]
new_df[(new_df['category_topic'] != new_df['imputed_category_topic'])]
# 185574 중 오답 8948 개
# category_topic = 33 인 경우 (소설) 에 특히 예측이 더 안됨 
# 해결방안 
# fiction은 summary_topic 사용해서 category_topic을 예측하자
# fiction이 아닌건 그대로 해도 될듯?

Unnamed: 0,category_topic,imputed_category_topic
32771,33.0,4.0
32777,33.0,195.0
163852,67.0,92.0
98317,33.0,2.0
65550,33.0,73.0
...,...,...
98270,33.0,125.0
131046,33.0,-1.0
98285,33.0,-1.0
131061,33.0,-1.0


## 결론

- 오답
    - 8948 / 185574
    - category_topic = 33 인 경우 (소설) 에 특히 예측이 더 안됨 
- 해결방안 
    - fiction은 summary_topic 사용해서 category_topic을 예측하는 건 어떨까?
    - fiction이 아닌건 그대로 하고, fiction category를 세분화하고 싶음