In [64]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss

In [87]:
users = pd.read_csv('../main_code/data/users.csv')
books = pd.read_csv("../main_code/data/books.csv", encoding='utf-8')
train = pd.read_csv('../main_code/data/train_ratings.csv')
test = pd.read_csv('../main_code/data/test_ratings.csv')
sub = pd.read_csv('../main_code/data/sample_submission.csv')

# books
- title, author, publisher, year 글자에 대한 작은 수정
- category 약 30개의 amazon 분류가 아닌, 아주 큰 3개 + others로 분류
- isbn_country로 3개의 분류만 따르기
- summary 잘라내기

In [88]:
def text_preprocessing_func(text : str ) -> str :
    """
    깨진 문자를 변환하는 함수
    """
    text = text.replace('Ã?Â©','e') # 원래는 é인데 걍 e로 메움
    text = text.replace('Ã©', 'e')
    text = text.replace('Ã?Â?','e') # 원래는 é인데 걍 e로 메움
    text = text.lower()
    return text

In [89]:
books['year_of_publication'] = books['year_of_publication'].astype(int) # 출판년도를 정수로

In [90]:
books = books.drop('summary', axis=1) # summary 삭제

In [91]:
# 카테고리 작업
books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip()) # 일단 category에서 대괄호 밖으로 빼기
books['category'] = books['category'].str.lower() # 소문자로 바꾸기

In [92]:
books['category_high'] = books['category'].copy() # category_high로 category를 복사

In [93]:
books['category_high'] = books['category_high'].fillna('Unclassified') # 안 채워진 건 미분류 항목으로 넣기

In [94]:
groupings = {'Fiction': ['fiction', 'ficti'], # 너무 넓으니 맨 위로 빼자
             'Literature & Poem': ['liter', 'poem', 'poetry', 'novel', 'sonnet'],
             'Science & Math': ['science', 'math', 'logy', 'chemis', 'physics', 'electron'], # science, logy 범위가 너무 넓으니 맨 위로
             'Parenting & Relationships': ['baby', 'babies', 'parent', 'family', 'friend', 'tionship', 'brother', 'sister', 'families', 'friendship', 'mother', 'father'], # 좀 큼
             'Medical Books': ['medi', 'psycho'], # psy의 세분화 가능
             'Adventure' : ['adventu'],
             'Animal & Nature': ['animal', 'ecolo', 'plant', 'nature', 'cat', 'dog', 'pets', 'bird', 'bear', 'horse', 'frog', 'duck', 'rabbit', 'dolphin', 'mice', 'deer',
             'panda', 'kangaroo', 'lizzard', 'gorilla', 'chimpangee', 'bat', 'insect'],
             'Arts & Photography': ['art', 'photo', 'drawing', 'picture'], # art는 겹치는 글자가 너무 많음
             'Authors' : ['authors'],
             'Biographies & Memoirs': ['biog', 'memo'],
             'Business & Money': ['busi', 'money', 'econo', 'finance'],
             'Calendars': ['calen'],
             'Children\'s Books': ['child', 'baby'],
             'Christian Books & Bibles': ['christi', 'bible'], #크리스마스때매
             'Christmas' : ['christma'],
             'Comics & Graphic Novels': ['comics', 'graphic novel'],
             'Computers & Technology': ['computer', 'techno', 'archi'],
             'Cookbooks, Food & Wine': ['cook', 'wine', 'food'],
             'Countries & Cities' : ['united states', 'russia', 'france', 'africa', 'china', 'japan', 'egypt', 'germany', 'ireland', 'california', 'berline', 'london', 'new york', 'canada',
             'chile', 'italy', 'europe', 'australia', 'great britain', 'arizona', 'chicago', 'netherlands', 'calif', 'mexico', 'colombia', 'greece',
             'florida', 'algeria', 'new zealand', 'austria', 'denmark', 'washington', 'india', 'england', 'brazil'],
             'Crafts, Hobbies & Home': ['crafts', 'hobb', 'home', 'house', 'garden'],
             'Crime & Murder' : ['crime', 'murder', 'criminal', 'homicide', 'mafia', 'gang', 'drug'],
             'Critic' : ['critic'],
             'Education & Teaching': ['educa', 'teach'],
             'Drama' : ['drama'],
             'Design' : ['design'],
             'Engineering & Transportation': ['engine', 'transp'],
             'Encyclopedia & Dictionary' : ['encyclo', 'dictiona', 'vocabulary'],
             'Essay' : ['essay'],
             'Health, Fitness & Dieting': ['health', 'fitness', 'diet'],
             'History': ['histo', 'war'],
             'Humor & Entertainment': ['humor', 'entertai', 'comed', 'game', 'comic'],
             'Law': ['law'],
             'Language' : ['language'],
             'LGBTQ+ Books': ['lesbian', 'gay', 'bisex'],
             'Mystery, Thriller & Suspense': ['myste', 'thril', 'suspen', 'horror', 'occult'],
             'Music & Dance' : ['music', 'dance', 'instrument', 'ballet', 'classic'],
             'Movie' : ['motion pictur', 'actor', 'actres', 'acting', 'cinema', 'theater', 'director', 'television'],
             'Politics': ['politic', 'president'],
             'Philosophy' : ['philoso'],
             'Reference': ['reference'],
             'Religion & Spirituality': ['religi', 'buddh', 'spirit', 'god', 'prayer', 'belief', 'doubt'],
             'Romance': ['romance'],
             'Science Fiction & Fantasy': ['imagin', 'science fiction', 'fantasy', 'fairy', 'fairies', 'vampire', 'epidemic', 'ghost', 'alien', 'supernatural', 'magic', 'dragons', 'elves', 'angel', 'devil'],
             'Short story' : ['short'],
             'Social Science' : ['social', 'ethic', 'communism', 'capitalism', 'generation', 'culture'],
             'Self-Help': ['self'], # self 검색시 모두 자기계발 관련
             'Study': ['test', 'school', 'examina', 'study aids', 'college'],
             'Sports & Outdoors': ['exerc','sport','outdoor', 'baseball', 'soccer', 'hockey', 'cricket', 'basketball', 'footbal'],
             'Teen & Young Adult': ['teen', 'adol', 'juven'], #nonfiction이란 말은 청소년 관련뿐
             'Travel': ['travel'],
             'Women' : ['women'],
             }

In [95]:
for new_group, small in groupings.items(): # 파편화된 카테고리를 새 그룹으로 묶어주기
    for s in small:
        books.loc[books[books['category'].str.contains(s, na = False)].index, 'category_high'] = new_group

In [96]:
# 10개 이하인 것들 다 미분류에 넣어보기
books_count = books.groupby('category_high').count()['isbn'].to_dict() # category_high별 isbn 수?
for i in range(len(books)):
    books.at[i, 'count'] = books_count[books['category_high'][i]] # books에 count 항목이 생김 # 미분류가 68851개

for i in range(len(books)): # 5033개의 항목을 미분류로 편입
    if books.at[i, 'count'] < 10:
        books.at[i, 'category_high'] = 'Unclassified'

In [97]:
books['category_high'].value_counts().tail(10)

readers                 11
alphabet                11
cults                   11
dreams                  10
beauty personal         10
intelligence service    10
celebrities             10
herbs                   10
courage                 10
bedtime                 10
Name: category_high, dtype: int64

# 기존 상위 10개의 카테고리는 다음과 같음
- Unclassified               68851 (카테고리가 빈 것들이 절대 다수)
- fiction                    33016
- juvenile fiction            5835
- biography autobiography     3326
- history                     1927
- religion                    1818
- juvenile nonfiction         1418
- social science              1231
- humor                       1161
- body mind spirit            1113
# grouping 이후 상위 10개의 카테고리는 다음과 같음 (일단 카테고리는 내 방식대로 두고 진행하겠음)
- Unclassified                 68851
- Fiction                      33842
- Teen & Young Adult            7351
- Biographies & Memoirs         3368
- Religion & Spirituality       3009
- History                       2058
- Humor & Entertainment         2053
- Animal & Nature               1403
- Parenting & Relationships     1388
- Arts & Photography            1368
# 10개 미만 카테고리를 다 미분류에 넣으면 다음과 같음
- Unclassified                 72151 
- Fiction                      33842
- Teen & Young Adult            7351
- Biographies & Memoirs         3368
- Religion & Spirituality       3009
- History                       2058
- Humor & Entertainment         2053
- Animal & Nature               1403
- Parenting & Relationships     1388
- Arts & Photography            1368

In [98]:
books['book_title'] = books['book_title'].apply(text_preprocessing_func) # 제목 깨진거 수정
books['publisher'] = books['publisher'].apply(text_preprocessing_func) # 출판사 깨진거 수정
books['book_author'] = books['book_author'].apply(text_preprocessing_func) # 작가 깨진거 수정

In [99]:
# 출판사에 대해 올바르게 수정하기
publisher_dict=(books['publisher'].value_counts()).to_dict() # 그리고 숫자 세기
publisher_count_df= pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count']) # 출판사별 숫자?

publisher_count_df = publisher_count_df.sort_values(by=['count'], ascending = False) # count 수대로 정렬

modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values # 1개 이상 있는 출판사의 경우
for publisher in modify_list:
    try:
        number = books[books['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
        right_publisher = books[books['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
        books.loc[books[books['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
    except: 
        pass

In [100]:
# isbn으로 country를 채우자
isbn_dict = {} # isbn으로 language를 채우자
isbn_dict = { books['language'][idx] : [isbn[:3]] if books['language'][idx] not in isbn_dict.keys() else isbn_dict[books['language'][idx]].append(isbn[:2]) for idx, isbn in enumerate(books['isbn'])}
isbn_code = {'0' : 'english', '1' : 'english', '2': 'franch', '3' : 'german', '4' : 'japan', '5' : 'russia', '7' : 'china',
             '65' : 'brazil', '80' : 'czecho', '81' : 'india', '82' : 'norway', '83' : 'poland', '84' : 'espanol', '85' : 'brazil', '86' : 'yugoslavia', '87' : 'danish', '88' : 'italy', '89' : 'korean', '90' : 'netherlands', '91' : 'sweden',
            '92' : 'international ngo', '93' : 'inida', '94' : 'netherlands', '600' : 'iran', '601' : 'kazakhstan', '602' : 'indonesia', '603' : 'saudi arabia', '604' : 'vietnam', '605' : 'turkey',
            '606' : 'romania', '607' : 'mexico', '608' : 'north macedonia', '609' : 'lithuania', '611' : 'thailand', '612' : 'peru', '613' : 'mauritius',
            '614' : 'lebanon', '615' : 'hungary', '616' : 'thailand', '617' : 'ukraine', '618' : 'greece', '619' : 'bulgaria', '620' : 'mauritius', '621' : 'phillippines',
            '622' : 'iran', '623' : 'indonesia', '624' : 'sri lanka', '625' : 'turkey', '626' : 'taiwan', '627' : 'pakistan', '628' : 'colombia', '629' : 'malaysia', '630' : 'romania',
            '950' : 'argentina', '951' : 'finland', '952' : 'finland', '953' : 'croatia', '954' : 'bulgaria', '955' : 'sri lanka',
            '956' : 'chile', '957' : 'taiwan', '958' : 'colombia', '959' : 'cuba', '960' : 'greece' , '961' : 'slovenia', '962' : 'hong kong',
            '963' : 'hungary', '964' : 'iran', '965' : 'israel', '966' : 'urkaine', '967' : 'malaysia', '968' : 'mexico', '969' : 'pakistan', '970' : 'mexico',
            '971' : 'phillippines', '972' : 'portugal', '973' : 'romania', '974' : 'thailand', '975' : 'turkey', '976' : 'caribbean community', '977' : 'egypt', '978' : 'nigeria', 
            '979' : 'indonesia', '980' : 'venezuela', '981' : 'singapore', '982' : 'south pacific', '983' : 'malaysia', '984' : 'bangladesh', '985' : 'velarus', '986' : 'taiwan',
            '987' : 'argentina', '988' : 'hong kong', '989' : 'portugal',
            '9960':'saudi arabia', '9963' : 'cyprus', '9968' : 'costa rica', '9971' : 'singapore', '9972' : 'peru', '9974' : 'uruguay',
            '9976' : 'tanzania', '9977' : 'costa rica', '9979' : 'iceland', '9986' : 'lithuania',
            '99903' : 'mauritius', '99905' : ' bolivia', '99909' : 'malta', '99912' : 'botswana', '99920' : 'andorra', '99928' : 'georgia',
            '99935' : 'haiti', '99936' : 'bhutan', '99942' : 'armenia', '99943' : 'albania', '99974' : 'bolivia',
            '99975' : 'mongolia', '99989' : 'paraguay'}
check_list = []
books['isbn_country'] = 'na'
for idx in range(len(books)):
    isbn = books['isbn'][idx][:5]
    if isbn[0] in isbn_code.keys():
        books.at[idx, 'isbn_country'] = isbn_code[isbn[0]]
    elif isbn[:2] in isbn_code.keys():
        books.at[idx, 'isbn_country'] = isbn_code[isbn[0:2]]
    elif isbn[:3] in isbn_code.keys():
        books.at[idx, 'isbn_country'] = isbn_code[isbn[0:3]]
    elif isbn[:4] in isbn_code.keys():
        books.at[idx, 'isbn_country'] = isbn_code[isbn[:4]]
    elif isbn[:] in isbn_code.keys():
        books.at[idx, 'isbn_country'] = isbn_code[isbn[:]]
    else:
        check_list.append(isbn)

books[books['isbn_country'] == 'na']['isbn_country'] = 'english' # nan이면 en으로 채움

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books[books['isbn_country'] == 'na']['isbn_country'] = 'english' # nan이면 en으로 채움


In [102]:
books = books.drop(['language'], axis = 1)
books = books.drop(['count'], axis = 1) # 필요없는 칼럼 삭제
books = books.drop(['category'], axis = 1) # 필요없는 칼럼 삭제

In [103]:
# 이제 isbn_country를 크게 묶어보자
books['isbn_country'].value_counts().head(10)

english        134405
german           6706
franch           3405
espanol          3399
italy             482
argentina         242
netherlands       176
portugal          106
mexico             95
japan              80
Name: isbn_country, dtype: int64

# isbn country 상위 10개는 다음과 같음
- english        134405 (미국 캐나다 영국 호주)
- german           6706 (독일)
- franch           3405 (프랑스) - 프랑스어 책이 유저 나라에 비해 겁나 많음 이러면 유저도 프랑스 사람 분류해야 함-
- espanol          3399 (스페인) - 여기까지 나누자, 그러면 이에 맞게 유저의 나라도 분류하는 게 맞음-
- italy             482 (이탈리아)
- argentina         242
- netherlands       176
- portugal          106
- mexico             95
- japan              80

In [104]:
books_language = ['english', 'german', 'franch', 'espanol']
for i in range(len(books)):
    if books.at[i, 'isbn_country'] not in books_language:
        books.at[i, 'isbn_country'] = 'others'

In [105]:
books['isbn_country'].value_counts().head(10)

english    134405
german       6706
franch       3405
espanol      3399
others       1655
Name: isbn_country, dtype: int64

In [126]:
books.isna().sum()

isbn                   0
book_title             0
book_author            0
year_of_publication    0
publisher              0
img_url                0
img_path               0
category_high          0
isbn_country           0
dtype: int64

In [127]:
books

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,img_path,category_high,isbn_country
0,0002005018,clara callan,richard bruce wright,2001,harpercollins,http://images.amazon.com/images/P/0002005018.0...,images/0002005018.01.THUMBZZZ.jpg,Movie,english
1,0060973129,decision in normandy,carlo d'este,1991,harpercollins,http://images.amazon.com/images/P/0060973129.0...,images/0060973129.01.THUMBZZZ.jpg,Unclassified,english
2,0374157065,flu: the story of the great influenza pandemic...,gina bari kolata,1999,farrar straus giroux,http://images.amazon.com/images/P/0374157065.0...,images/0374157065.01.THUMBZZZ.jpg,Medical Books,english
3,0399135782,the kitchen god's wife,amy tan,1991,putnam pub group,http://images.amazon.com/images/P/0399135782.0...,images/0399135782.01.THUMBZZZ.jpg,Fiction,english
4,0425176428,what if?: the world's foremost military histor...,robert cowley,2000,berkley publishing group,http://images.amazon.com/images/P/0425176428.0...,images/0425176428.01.THUMBZZZ.jpg,History,english
...,...,...,...,...,...,...,...,...,...
149565,067161746X,the bachelor home companion: a practical guide...,p.j. o'rourke,1987,pocket,http://images.amazon.com/images/P/067161746X.0...,images/067161746X.01.THUMBZZZ.jpg,Humor & Entertainment,english
149566,0767907566,all elevations unknown: an adventure in the he...,sam lightner,2001,broadway books,http://images.amazon.com/images/P/0767907566.0...,images/0767907566.01.THUMBZZZ.jpg,Animal & Nature,english
149567,0884159221,why stop?: a guide to texas historical roadsid...,claude dooley,1985,bridge publications,http://images.amazon.com/images/P/0884159221.0...,images/0884159221.01.THUMBZZZ.jpg,Unclassified,english
149568,0912333022,the are you being served? stories: 'camping in...,jeremy lloyd,1997,pub group west,http://images.amazon.com/images/P/0912333022.0...,images/0912333022.01.THUMBZZZ.jpg,Fiction,english


In [128]:
books.to_csv('./data/books_1102.csv', index=False)

# users
- age 자르기
- location 분리하기 (city, state, country)
- **user의 country를 미국/캐나다/독일/프랑스로 나눠보자 (세부 조정 필요)**

In [108]:
users # 68092 x 3

Unnamed: 0,user_id,location,age
0,8,"timmins, ontario, canada",
1,11400,"ottawa, ontario, canada",49.0
2,11676,"n/a, n/a, n/a",
3,67544,"toronto, ontario, canada",30.0
4,85526,"victoria, british columbia, canada",36.0
...,...,...,...
68087,278376,"danville, pennsylvania, usa",54.0
68088,278621,"victoria, delaware, canada",74.0
68089,278636,"irvington, alabama, usa",
68090,278659,"vancouver, washington, usa",33.0


In [109]:
def age_map(x: int) -> int: # age map 하지만 age 자를 거면 안 씀
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [110]:
users = users.drop('age', axis=1) # 1. age 자르기
users

Unnamed: 0,user_id,location
0,8,"timmins, ontario, canada"
1,11400,"ottawa, ontario, canada"
2,11676,"n/a, n/a, n/a"
3,67544,"toronto, ontario, canada"
4,85526,"victoria, british columbia, canada"
...,...,...
68087,278376,"danville, pennsylvania, usa"
68088,278621,"victoria, delaware, canada"
68089,278636,"irvington, alabama, usa"
68090,278659,"vancouver, washington, usa"


In [111]:
#2. location 분리하기 (city, state, country)
users['location'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '') # location에서 특문 삭제

# split 하여 칼럼 할당
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0].strip())
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1].strip())
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2].strip())

users = users.replace('na', np.nan) #특수문자 제거로 n/a가 na로 바뀌게 되었습니다. 따라서 이를 컴퓨터가 인식할 수 있는 결측값으로 변환합니다.
users = users.replace('', np.nan) # 일부 경우 , , ,으로 입력된 경우가 있었으므로 이런 경우에도 결측값으로 변환합니다.
users

  users['location'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '') # location에서 특문 삭제


Unnamed: 0,user_id,location,location_city,location_state,location_country
0,8,"timmins,ontario,canada",timmins,ontario,canada
1,11400,"ottawa,ontario,canada",ottawa,ontario,canada
2,11676,"na,na,na",,,
3,67544,"toronto,ontario,canada",toronto,ontario,canada
4,85526,"victoria,britishcolumbia,canada",victoria,britishcolumbia,canada
...,...,...,...,...,...
68087,278376,"danville,pennsylvania,usa",danville,pennsylvania,usa
68088,278621,"victoria,delaware,canada",victoria,delaware,canada
68089,278636,"irvington,alabama,usa",irvington,alabama,usa
68090,278659,"vancouver,washington,usa",vancouver,washington,usa


In [112]:
# 도시로 나라를 채워보자!
modify_location = users[(users['location_country'].isna())&(users['location_city'].notnull())]['location_city'].values # 나라는 없고 도시만 있는 도시 목록 -> 얘들 나라를 채워주자

location_list = [] # 제대로 된 location을 모아 둠 이거 보고 위에 애들 나라 채워줄 것
for location in modify_location:
    try:
        right_location = users[(users['location'].str.contains(location))&(users['location_country'].notnull())]['location'].value_counts().index[0]
        location_list.append(right_location)
    except:
        pass
    
for location in location_list: # 위 리스트로 채워주기
    users.loc[users[users['location_city']==location.split(',')[0]].index,'location_state'] = location.split(',')[1] # 주 채워주기
    users.loc[users[users['location_city']==location.split(',')[0]].index,'location_country'] = location.split(',')[2] # 나라 채워주기

In [113]:
users.isnull().sum() # city, state는 자를 거라서 없어도 괜찮은데 나라는 일단 others로 채워주자

user_id                0
location               0
location_city        122
location_state      1132
location_country     271
dtype: int64

## 의문사항
- state는 country에 종속적이라 볼 수 있는데, 살리는 게 맞나?
- 만약 살린다면, 상위 3개를 끊는 것이 의미가 있나? 분명 미국 또는 캐나다의 state일텐데?
- 그럼 상위 나라별 state 3개를 써야 하나? (그럼 9개)

## 일단 잘라!

In [114]:
# 일단 state 삭제
users = users.drop('location_city', axis=1)
users = users.drop('location_state', axis=1)
users

Unnamed: 0,user_id,location,location_country
0,8,"timmins,ontario,canada",canada
1,11400,"ottawa,ontario,canada",canada
2,11676,"na,na,na",
3,67544,"toronto,ontario,canada",canada
4,85526,"victoria,britishcolumbia,canada",canada
...,...,...,...
68087,278376,"danville,pennsylvania,usa",usa
68088,278621,"victoria,delaware,canada",canada
68089,278636,"irvington,alabama,usa",usa
68090,278659,"vancouver,washington,usa",canada


In [115]:
users = users.drop('location', axis=1) # location도 마저 잘라줌
users

Unnamed: 0,user_id,location_country
0,8,canada
1,11400,canada
2,11676,
3,67544,canada
4,85526,canada
...,...,...
68087,278376,usa
68088,278621,canada
68089,278636,usa
68090,278659,canada


In [116]:
users['location_country'] = users['location_country'].fillna('anycountry') # 일단 빈 것은 others로 채움
users

Unnamed: 0,user_id,location_country
0,8,canada
1,11400,canada
2,11676,anycountry
3,67544,canada
4,85526,canada
...,...,...
68087,278376,usa
68088,278621,canada
68089,278636,usa
68090,278659,canada


In [117]:
# country 잘못된 애들 교정
country_fix_dict = {'usa': {'oklahoma','districtofcolumbia', 'connecticut', 'worcester', 'aroostook', 'texas',  'kern', 'orangeco', 'unitedstatesofamerica', 'fortbend', 'alachua', 'massachusetts', 'arizona', 'austin', 'hawaii', 'ohio', 'camden', 'arkansas', 'minnesota', 'losestadosunidosdenorteamerica', 'us', 'usanow', 'northcarolina', 'maine', 'colorado','oklahoma', 'alabama', 'anystate', 'districtofcolumbia', 'unitedstaes', 'pender', 'newhampshire', 'unitedstates', 'missouri', 'idaho', 'ca', 'newyork','tennessee', 'stthomasi', 'dc', 'washington', 'illinois', 'california', 'michigan', 'iowa', 'maryland', 'newjersey', 'vanwert', 'oregon'},
                    'uk': {'alderney', 'wales',  'aberdeenshire', 'bermuda', 'nottinghamshire', 'scotland', 'usacurrentlylivinginengland', 'england', 'countycork', 'alderney', 'cambridgeshire', 'middlesex', 'northyorkshire', 'westyorkshire', 'cocarlow', 'sthelena'},
                    'japan': {'okinawa'},
                    'southkorea': {'seoul'},
                    'canada': {'ontario', 'alberta', 'novascotia', 'newfoundland', 'newbrunswick', 'britishcolumbia'},
                    'miyanma': {'burma'},
                    'newzealand': {'auckland', 'nz', 'otago'},
                    'spain': {'andalucia','pontevedra', 'gipuzkoa', 'lleida', 'catalunyaspain', 'galiza', 'espaa'},
                    'germany': {'niedersachsen', 'deutschland'},
                    'brazil': {'disritofederal'},
                    'switzerland': {'lasuisse'},
                    'italy': {'veneziagiulia', 'ferrara', 'italia'},
                    'australia': {'nsw', 'queensland', 'newsouthwales'},
                    'belgium': {'labelgique', 'bergued'},
                    'uruguay': {'urugua'},
                    'panama': {'republicofpanama'}
                   }
country_del_list = ['c', 'space', 'universe', 'unknown', 'quit', 'tdzimi', 'universe', 'tn', 'unknown', 'space', 'c', 'franciscomorazan', 'petrolwarnation', 'ineurope', 'hereandthere', 'faraway'] 

In [118]:
# 교정 및 이상한 나라를 가진 애들 모음
del_idx = []
for idx, row in enumerate(users['location_country']):
    for key, value in country_fix_dict.items():
        if row in value:
            users.at[idx, 'location_country'] = key
    if row in country_del_list:
        del_idx.append(idx)

In [119]:
# 이상한 나라를 가진 애들도 others에 넣기
for i in del_idx:
    users.at[i, 'location_country'] = 'anycountry'

In [120]:
users['location_country'].value_counts().head(10)

usa              45301
canada            6538
germany           3609
unitedkingdom     3148
australia         1821
spain             1692
italy              830
france             829
newzealand         462
switzerland        459
Name: location_country, dtype: int64

# 상위 10개의 location_country는 아래와 같음
- usa              45301
- canada            6538
- germany           3609
- unitedkingdom     3148
- australia         1821
- spain             1692
- italy              830 -이탈리아 사람이 프랑스 사람보다 많긴 하지만 책에서 이탈리아 뺐기 때문에 분류하지 않음-
- france             829 - 프랑스 사람까지 분류-
- newzealand         462
- switzerland        459
- 참고사항
    - others는 원래 비어있었거나, 이상하게 입력된 것으로 기존 285개
    - unitedkingdom부터 others로 편입시킬 것임

In [121]:
users_country = ['usa', 'canada', 'germany', 'unitedkingdom', 'australia', 'spain', 'france'] # italy 빠짐
for i in range(len(users)):
    if users.at[i, 'location_country'] not in users_country:
        users.at[i, 'location_country'] = 'anycountry'

In [123]:
users['location_country'].value_counts().head(10) # anycountry가 5154개로 3위에 등극!

usa              45301
canada            6538
anycountry        5154
germany           3609
unitedkingdom     3148
australia         1821
spain             1692
france             829
Name: location_country, dtype: int64

In [125]:
users.isna().sum()

user_id             0
location_country    0
dtype: int64

In [129]:
users.to_csv('./data/users_1102.csv', index=False)

In [131]:
trains = pd.read_csv('../main_code/data/train_ratings.csv')

In [145]:
trains[trains['user_id'] == 176]

Unnamed: 0,user_id,isbn,rating


In [147]:
train['rating'].mean()

0    8
Name: rating, dtype: int64

In [156]:
trains[trains['isbn'] == '0785268839'].mode(0)

Unnamed: 0,user_id,isbn,rating
0,104429,785268839.0,10.0
1,117583,,
2,118705,,
3,140036,,
4,258556,,
