In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys

from tqdm import tqdm

import re

from PIL import Image
import requests

import warnings
warnings.filterwarnings('ignore')

In [2]:
data_path = "/opt/ml/data/"

In [3]:
users = pd.read_csv(data_path + 'users.csv')
books = pd.read_csv(data_path + 'books.csv')
train = pd.read_csv(data_path + 'train_ratings.csv')
test = pd.read_csv(data_path + 'test_ratings.csv')

In [4]:
print('users shape: ', users.shape)
print('books shape: ', books.shape)
print('ratings shape: ', train.shape)

users shape:  (68092, 3)
books shape:  (149570, 10)
ratings shape:  (306795, 3)


In [5]:
def replace_na(unique:np.array):
    unique = unique.tolist()
    unique.remove('na')
    unique = ['na'] + unique
    return unique

def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6
    
ratings1 = train.copy() ; ratings2 = test.copy()

users['location'] = users['location'].str.replace(r'[^0-9a-zA-Z:,]', '') # 특수문자 제거
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
users = users.replace('na', np.nan) #특수문자 제거로 n/a가 na로 바뀌게 되었습니다. 따라서 이를 컴퓨터가 인식할 수 있는 결측값으로 변환합니다.
users = users.replace('', np.nan) # 일부 경우 , , ,으로 입력된 경우가 있었으므로 이런 경우에도 결측값으로 변환합니다.

# city는 있는데 country 없는 경우 채우기
modify_location = users[(users['location_country'].isna())&(users['location_city'].notnull())]['location_city'].values

location_list = []
for location in tqdm(modify_location, desc='(1/4) fill country'):
    try:
        right_location = users[(users['location'].str.contains(location))&(users['location_country'].notnull())]['location'].value_counts().index[0]
        location_list.append(right_location)
    except:
        pass

for location in tqdm(location_list, desc='(2/4) fill city'):
    users.loc[users[users['location_city']==location.split(',')[0]].index,'location_state'] = location.split(',')[1]
    users.loc[users[users['location_city']==location.split(',')[0]].index,'location_country'] = location.split(',')[2]


# book preprocessing

# 유명 출판사 표기 오류로 그룹화되지 못하는 케이스 처리
publisher_dict=(books['publisher'].value_counts()).to_dict()
publisher_count_df = pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count'])
publisher_count_df = publisher_count_df.sort_values(by=['count'], ascending = False)

modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values

for publisher in tqdm(modify_list, desc = '(3/4) grouping same publisher'):
    try:
        number = books[books['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
        right_publisher = books[books['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
        books.loc[books[books['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
    except: 
        pass

# category 대괄호 제거 및 소문자 변환
books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
books['category'] = books['category'].str.lower()

# 43개의 high-category로 묶기
categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
                'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
                'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
                'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']

for category in tqdm(categories, desc = '(4/4) : high-categorizing'):
    books.loc[books[books['category'].str.contains(category,na=False)].index,'category_high'] = category

# 10개 이하 항목 others로 묶기
category_high_df = pd.DataFrame(books['category_high'].value_counts()).reset_index()
category_high_df.columns = ['category','count']
others_list = category_high_df[category_high_df['count']<10]['category'].values
books.loc[books[books['category_high'].isin(others_list)].index, 'category_high']='others'

# year_of_publication 변수 전처리
books.loc[104259, 'year_of_publication'] = 2010.0
books.loc[121860, 'year_of_publication'] = 1997.0
books = books.drop(np.where(books['year_of_publication'] < 1900)[0][0]).reset_index(drop=True)

# location은 이제 필요 없음
users = users.drop(['location'], axis=1)
print('-'*20, 'Mission1 EDA Done', '-'*20)

ratings = pd.concat([ratings1, ratings2]).reset_index(drop=True)

# 인덱싱 처리된 데이터 조인
context_df = ratings.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'category_high', 'publisher', 'language', 'book_author', 'year_of_publication']], on='isbn', how='left')
train_df = ratings1.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'category_high', 'publisher', 'language', 'book_author', 'year_of_publication']], on='isbn', how='left')
test_df = ratings2.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'category_high', 'publisher', 'language', 'book_author', 'year_of_publication']], on='isbn', how='left')

train_df['age'] = train_df['age'].fillna(int(train_df['age'].mean()))
train_df['age'] = train_df['age'].apply(age_map)
test_df['age'] = test_df['age'].fillna(int(test_df['age'].mean()))
test_df['age'] = test_df['age'].apply(age_map)

(1/4) fill country: 100%|██████████| 2113/2113 [00:56<00:00, 37.43it/s]
(2/4) fill city: 100%|██████████| 1962/1962 [00:18<00:00, 103.83it/s]
(3/4) grouping same publisher: 100%|██████████| 5276/5276 [02:29<00:00, 35.19it/s]
(4/4) : high-categorizing: 100%|██████████| 43/43 [00:02<00:00, 19.53it/s]


-------------------- Mission1 EDA Done --------------------


In [6]:
train_df_copy = train_df.copy() ; test_df_copy = test_df.copy() ; context_df_copy = context_df.copy()

In [7]:
rating1_user = train_df_copy[train_df_copy['rating']==1]['user_id'].unique()

bot1_users = []

for user in tqdm(rating1_user):
    if (train_df[train_df['user_id']==user]['rating'].nunique()==1) & (len(train_df[train_df['user_id']==user]['rating']) > 2):
        bot1_users.append(user)

100%|██████████| 7616/7616 [00:08<00:00, 919.30it/s]


In [8]:
len(bot1_users)

11

In [9]:
bot_df = train_df[train_df['user_id'].isin(bot1_users)]

In [10]:
# bot_df = books.merge(bot_df, on='isbn', how='left').merge(bot_df[['user_id', 'isbn', 'publisher', 'language', 'book_author', 'year_of_publication']], on='isbn', how='left')
bot_df = bot_df.merge(books[['isbn', 'book_title']], on='isbn', how='left')[['user_id', 'language', 'book_author', 'book_title']]

In [11]:
bot_df.groupby('user_id')['book_title'].value_counts()

user_id  book_title                                                                              
9083     Hannibal                                                                                    1
         The Brethren                                                                                1
         The Talisman                                                                                1
12392    Escape Via Siberia: A Jewish Child's Odyssey of Survival                                    1
         Pigs in Heaven                                                                              1
         Tom Clancy's Op-Center: Games of State (Tom Clancy's Op Center (Paperback))                 1
95511    Bless Me, Ultima                                                                            1
         GOLEM 100                                                                                   1
         L. Ron Hubbard Presents Writers of the Future Vol. 16                

모두 각각 다른책에 전부 평점을 1점을 주었음 -> 이 친구들은 그냥 다 1점을 주는 사람들이라고 생각해보자

In [12]:
list(set(test_df['user_id'].unique()) & set(bot1_users))

[12392, 256618, 116746, 95511]

In [14]:
sota_2_1291 = pd.read_csv('20230416_004712_catboost.csv')

In [15]:
sota_2_1291_bot_replace = sota_2_1291.copy()
sota_2_1291_bot_replace.loc[sota_2_1291_bot_replace['user_id'].isin(bot1_users), 'rating'] = 1

In [16]:
sota_2_1291_bot_replace[sota_2_1291_bot_replace['user_id'].isin(bot1_users)]['rating']

14942    1.0
24678    1.0
37674    1.0
48901    1.0
63508    1.0
Name: rating, dtype: float64

In [17]:
sota_2_1291_bot_replace_ensemble = sota_2_1291.copy()
sota_2_1291_bot_replace_ensemble['rating'] = (sota_2_1291_bot_replace['rating'] + sota_2_1291['rating'])/2

In [18]:
sota_2_1291_bot_replace

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,7.285003
1,116866,0002005018,7.808521
2,152827,0060973129,7.523333
3,157969,0374157065,7.733604
4,67958,0399135782,7.376109
...,...,...,...
76694,278543,1576734218,5.394856
76695,278563,3492223710,6.552265
76696,278633,1896095186,6.249670
76697,278668,8408044079,4.979711


In [19]:
sota_2_1291_bot_replace_ensemble[sota_2_1291_bot_replace_ensemble['user_id'].isin(bot1_users)]['rating']

14942    3.216642
24678    3.333268
37674    3.099679
48901    3.782704
63508    3.033942
Name: rating, dtype: float64

In [101]:
sota_2_1291_bot_replace.to_csv('sota_2_1291_bot_replace.csv')
sota_2_1291_bot_replace_ensemble.to_csv('sota_2_1291_bot_replace_ensemble.csv')