In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, linalg
# !pip install xgboost
from xgboost import XGBRegressor, XGBClassifier
# !pip install lightgbm
from lightgbm import LGBMRegressor, LGBMClassifier
# !pip install catboost
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings(action='ignore')
import re

In [2]:
books = pd.read_csv("data/books.csv")
train = pd.read_csv("data/train_ratings.csv")
test = pd.read_csv("data/test_ratings.csv")
users = pd.read_csv("data/users.csv")
sub = pd.read_csv("data/sample_submission.csv")

In [3]:
seed=42

In [4]:
def age_map(x: int) -> int:
    x = int(x)
    if x < 20:
        return 1
    elif x >= 20 and x < 30:
        return 2
    elif x >= 30 and x < 40:
        return 3
    elif x >= 40 and x < 50:
        return 4
    elif x >= 50 and x < 60:
        return 5
    else:
        return 6

In [5]:
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
users = users.drop(['location'], axis=1)

######################### location 전처리
users['location_city'] = users['location_city'].str.strip()
users['location_state'] = users['location_state'].str.strip()
users['location_country'] = users['location_country'].str.strip()
users['location_city'] = users['location_city'].str.replace(r'[^a-zA-Z]', '', regex=True)
users['location_state'] = users['location_state'].str.replace(r'[^a-zA-Z]', '', regex=True)
users['location_country'] = users['location_country'].str.replace(r'[^a-zA-Z]', '', regex=True)
'''
location_country
'''
# null & na & universe & etc
null_repl = [
    'universe', 'na', '', 'lava', 'petrolwarnation', 'space', 'lachineternelle',
    'faraway', 'everywhereandanywhere', 'hereandthere', 'tdzimi', 'naontheroad',
    'unknown'
]
for keyword in null_repl:
    users.loc[users['location_country'] == keyword, 'location_country'] = 'null'
users.loc[users['location_country'] == 'c', 'location_country'] = 'null'
# australia
australia_repl = [
    'newsouthwales', 'queensland', 'tasmania', 'victoria', 'nsw', 'southaustralia'
]
for keyword in australia_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'australia'
# italy
users.loc[users['location_country'].str.contains('ital'), 'location_country'] = 'italy'
users.loc[users['location_country'].str.contains('ferrara'), 'location_country'] = 'italy'
users.loc[users['location_country'].str.contains('veneziagiulia'), 'location_country'] = 'italy'
users.loc[users['location_country'].str.contains('ineurope'), 'location_country'] = 'italy'
# germany
users.loc[users['location_country'].str.contains('deut'), 'location_country'] = 'germany'
users.loc[users['location_country'].str.contains('germ'), 'location_country'] = 'germany'
users.loc[users['location_country'].str.contains('berlin'), 'location_country'] = 'germany'
users.loc[users['location_country'].str.contains('niedersachsen'), 'location_country'] = 'germany'
# united kingdom
uk_repls = [
    'unitedkingdom', 'eng', 'king', 'wales', 'scotland', 'aberdeenshire', 'camden', 'unitedkindgonm',
    'middlesex', 'nottinghamshire', 'westyorkshire', 'cambridgeshire', 'sthelena', 'northyorkshire',
    'obviously'
]
for keyword in uk_repls:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'united kingdom'
users.loc[users['location_country'] == 'uk', 'location_country'] = 'united kingdom'
# ireland
users.loc[users['location_country'].str.contains('countycork'), 'location_country'] = 'ireland'
users.loc[users['location_country'].str.contains('cocarlow'), 'location_country'] = 'ireland'
# france
users.loc[users['location_country'].str.contains('fran'), 'location_country'] = 'france'
users.loc[users['location_country'].str.contains('paris'), 'location_country'] = 'france'
# spain
spain_repl = [
    'esp', 'catal', 'galiza', 'euskalherria', 'lleida', 'gipuzkoa', 'orense', 'pontevedra', 'almera',
    'bergued', 'andalucia'
]
for keyword in spain_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'spain'
# portugal
users.loc[users['location_country'].str.contains('oeiras'), 'location_country'] = 'portugal'
# belgium
users.loc[users['location_country'].str.contains('labelgique'), 'location_country'] = 'belgium'
# austria
users.loc[users['location_country'].str.contains('eu'), 'location_country'] = 'austria'
# swiss
users.loc[users['location_country'].str.contains('lasuisse'), 'location_country'] = 'switzerland'
# finland
users.loc[users['location_country'].str.contains('etelsuomi'), 'location_country'] = 'finland'
# usa
usa_repl = [
    'unitedstaes', 'america', 'usa', 'state', 'sate', 'cali', 'dc', 'oregon', 'texas', 'florida',
    'newhampshire', 'newmexico', 'newjersey', 'newyork', 'virginia', 'bermuda', 'illinois', 'michigan',
    'arizona', 'indiana', 'minnesota', 'tennessee', 'dakota', 'connecticut', 'wisconsin', 'ohio',
    'maryland', 'northcarolina', 'massachusetts', 'colorado', 'washington', 'maine', 'georgia', 'oklahoma',
    'maracopa', 'districtofcolumbia', 'saintloius', 'orangeco', 'aroostook', 'arkansas', 'montana',
    'rhodeisland', 'nevada', 'kern', 'fortbend', 'nebraska', 'usofa', 'alabama', 'csa', 'polk',
    'alachua', 'austin', 'alaska', 'hawaii', 'worcester', 'iowa', 'cherokee', 'shelby', 'stthomasi',
    'vanwert', 'kansas', 'idaho', 'tn', 'framingham', 'pender', 'ysa', 'arizona', 'morgan', 'rutherford'
]
for keyword in usa_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'usa'
users.loc[users['location_country'] == 'us', 'location_country'] = 'usa'
users.loc[users['location_country'] == 'ca', 'location_country'] = 'usa'
users.loc[users['location_country'] == 'il', 'location_country'] = 'usa'
users.loc[users['location_country'] == 'ua', 'location_country'] = 'usa'
# cananda
canada_repl = [
    'cananda', 'british', 'newfoundland', 'newbrunswick', 'alberta', 'ontario', 'lkjlj', 'bc',
    'novascotia', 'kcb', 'quebec', 'maricopa', 'travelling', 'vvh', 'saskatchewan'
]
for keyword in canada_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'canada'
# new zealand
users.loc[users['location_country'] == 'nz', 'location_country'] = 'newzealand'
users.loc[users['location_country'].str.contains('otago'), 'location_country'] = 'newzealand'
users.loc[users['location_country'].str.contains('auckland'), 'location_country'] = 'newzealand'
# malaysia
users.loc[users['location_country'].str.contains('kedah'), 'location_country'] = 'malaysia'
# uae
users.loc[users['location_country'].str.contains('uae'), 'location_country'] = 'unitedarabemirates'
# kuwait
users.loc[users['location_country'].str.contains('quit'), 'location_country'] = 'kuwait'
# phillipines
users.loc[users['location_country'].str.contains('phill'), 'location_country'] = 'philippines'
users.loc[users['location_country'].str.contains('metromanila'), 'location_country'] = 'philippines'
# uruguay
users.loc[users['location_country'].str.contains('urugua'), 'location_country'] = 'uruguay'
# panama
users.loc[users['location_country'].str.contains('republicofpanama'), 'location_country'] = 'panama'
# trinidadandtobago
users.loc[users['location_country'].str.contains('westindies'), 'location_country'] = 'trinidadandtobago'
# guernsey
users.loc[users['location_country'].str.contains('alderney'), 'location_country'] = 'guernsey'
# japan
users.loc[users['location_country'].str.contains('okinawa'), 'location_country'] = 'japan'
# korea
users.loc[users['location_country'].str.contains('seoul'), 'location_country'] = 'southkorea'
# brazil
users.loc[users['location_country'].str.contains('disritofederal'), 'location_country'] = 'brazil'
'''
location_state
'''
# usa
usa_state_repl = [
    'usa', 'texas', 'tx', 'california', 'massachusetts', 'michigan', 'carolina', 'florida', 'colorado', 'pennsylvania',
    'newyork', 'newjersey', 'virginia', 'dc', 'washington', 'iowa', 'illinois', 'georgia', 'kansas', 'missouri',
    'mississippi', 'oregon', 'arizona', 'ohio', 'tennessee', 'idaho', 'alaska', 'alabama', 'minnesota', 'utah',
    'kentucky', 'rhodeisland', 'maryland', 'louisiana', 'indiana', 'connecticut', 'wisconsin', 'newhampshire',
    'nevada', 'oklahoma', 'georgia', 'maine', 'newmexico', 'nebraska', 'wyoming', 'frenchquarter', 'fl', 'nebr', 'ct',

]
for keyword in usa_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'usa'
# canada
canada_state_repl = [
    'britishcolumbia', 'newbrunswick', 'novascotia', 'ontario', 'alberta', 'quebec', 'saskatchewan',
    'manitoba', 
]
for keyword in canada_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'canada'
# mexico
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('jalisco')), 'location_country'] = 'mexico'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('michoacan')), 'location_country'] = 'mexico'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('morelos')), 'location_country'] = 'mexico'
# united kingdom
uk_state_repl = [
    'newhampshire', 'nottinghamshire', 'england', 'middlesex', 'midlothian', 'scotland', 'westyorkshire',
    'canterbury', 'wiltshire', 'kent', 'london', 'cambs', 'herts', 'isleofman', 'surrey', 'cheshire',
    'gloucestershire', 'aberdeenshire'
]
for keyword in uk_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'united kingdom'
# ireland
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('dublin')), 'location_country'] = 'ireland'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('wicklow')), 'location_country'] = 'ireland'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('colimerick')), 'location_country'] = 'ireland'
# australia
australia_state_repl = [
    'newsouthwales', 'victoria', 'australiancapitalterritory', 'southaustralia', 'nsw'
]
for keyword in australia_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'australia'
# germany
germany_state_repl = [
    'nordrheinwestfalen', 'bayern', 'hamburg', 'badenwuerttemberg', 'badenwrttemberg', 'sachsen', 'berlin',
    'stuttgart', 'nrw', 'bavaria', 'bremen'
]
for keyword in germany_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'germany'
# switzerland
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('bern')), 'location_country'] = 'switzerland'
# austria
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('niederoesterreich')), 'location_country'] = 'austria'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('vienna')), 'location_country'] = 'austria'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('wien')), 'location_country'] = 'austria'
# slovenia
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('ljubljanskaregija')), 'location_country'] = 'slovenia'
# spain
spain_state_repl = [
    'catalunya', 'pontevedra', 'madrid', 'bizkaia', 'asturias', 'pontevedra', 'barcelona', 'pasvasco',
    'espaa', 'badajoz', 'gipuzkoa', 'valencia', 'galicia'
]
for keyword in spain_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'spain'
# portugal
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('lisboa')), 'location_country'] = 'portugal'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('coimbra')), 'location_country'] = 'portugal'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('porto')), 'location_country'] = 'portugal'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('estremadura')), 'location_country'] = 'portugal'
# netherlands
netherlands_state_repl = [
    'noordholland', 'utrecht', 'zuidholland', 'overijssel', 'friesland', 'northholland', 'schleswigholstein',
    'zh', 'twente', 
]
for keyword in netherlands_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'netherlands'
# belgium
belgium_state_repl = [
    'vlaamsbrabant', 'liege'
]
for keyword in belgium_state_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains(keyword)), 'location_country'] = 'belgium'
# new zealand
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('auckland')), 'location_country'] = 'newzealand'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('northisland')), 'location_country'] = 'newzealand'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('waikato')), 'location_country'] = 'newzealand'
# italy
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('italia')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('toscana')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('piemonte')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('lombardia')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('gorizia')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_state'] == 're'), 'location_country'] = 'italy'
# greece
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('townofbali')), 'location_country'] = 'greece'
# nigeria
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('imostate')), 'location_country'] = 'nigeria'
# southafrica
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('westerncape')), 'location_country'] = 'southafrica'
# romania
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('ilfov')), 'location_country'] = 'romania'
# malaysia
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('penang')), 'location_country'] = 'malaysia'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('negerisembilan')), 'location_country'] = 'malaysia'
# indonesia
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('jakarta')), 'location_country'] = 'indonesia'
# philippines
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('laguna')), 'location_country'] = 'philippines'
# singapore
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('singapore')), 'location_country'] = 'singapore'
# pakistan
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('punjab')), 'location_country'] = 'pakistan'
# denmark
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('jutland')), 'location_country'] = 'denmark'
# france
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('lorraine')), 'location_country'] = 'france'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('hautegaronne')), 'location_country'] = 'france'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('heraut')), 'location_country'] = 'france'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('rhnealpes')), 'location_country'] = 'france'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('iledefrance')), 'location_country'] = 'france'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('paca')), 'location_country'] = 'france'
# uruguay
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('montevideo')), 'location_country'] = 'uruguay'
# argentina
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('buenosaires')), 'location_country'] = 'argentina'
# peru
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('southamerica')), 'location_country'] = 'peru'
# chile
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('santiago')), 'location_country'] = 'chile'
# japan
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('iwakuni')), 'location_country'] = 'japan'
users.loc[(users['location_country'] == 'null') & (users['location_state'].str.contains('tokyo')), 'location_country'] = 'japan'



'''
location_city
'''
# usa
usa_city_repl = [
    'losang', 'seattle', 'sanf', 'sand', 'newyork', 'newark', 'newbedford', 'portland', 'cincinnati',
    'houston', 'albuquerque', 'chicago', 'austin', 'beaverton', 'raleigh', 'richmond', 'fairbanks',
    'minneapolis', 'stlouis', 'tucson', 'oakland', 'boston', 'kansascity', 'denver', 'springfield',
    'topeka', 'dallas', 'asheville', 'buffalo', 'fremont', 'stpaul', 'elcajon', 'miami', 'marysville',
    'baltimore', 'charleston', 'santamonica', 'knoxville', 'rochester', 'orlando', 'coloradosprings',
    'arlington', 'pensacola', 'sanjose', 'cedarrapids', 'olympia', 'lasvegas', 'mercerisland',
    'encinitas', 'omaha', 'lawrence', 'sacramento', 'norfolk', 'kirkwood', 'tallahassee', 'lexington',
    'kalamazoo', 'orleans', 'desmoines', 'aurora', 'annarbor', 'newbern', 'somerville', 'lakeland',
    'hartford', 'tigard', 'phoenix', 'irvine', 'sanantonio', 'mesa', 'brooklyn', 'philadelphia',
    'lacey', 'greenbay', 'pittsburg', 'wichita', 'elizabeth', 'murrieta', 'batonrouge', 'yuma',
    'baycity', 'lynchburg', 'santabarbara', 'statenisland', 'saintpaul', 'lakewood', 'fallschurch',
    'northhaven', 'frederick', 'milwaukie', 'cary', 'stcharles', 'lewiston', 'virginiabeach',
    'longbranch', 'indianapolis', 'portales', 'fountainvalley', 'sebastopol', 'washington', 'louisville',
    'millersburg'
]
for keyword in usa_city_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains(keyword)), 'location_country'] = 'usa'
# canada
canada_city_repl = [
    'calgary', 'vancouver', 'toronto', 'ottawa', 'fredericton', 'victoria', 'hamilton', 'montreal',
    'kelowna', 'winnipeg', 'saskatoon', 'halifax', 'edmonton', 'kitchener', 'regina', 'lethbridge',

]
for keyword in canada_city_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains(keyword)), 'location_country'] = 'canada'
# italy
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('milano')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('roma')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('rome')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('genova')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('torino')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('perugia')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('salerno')), 'location_country'] = 'italy'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('firenze')), 'location_country'] = 'italy'
# united kingdom
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('london')), 'location_country'] = 'united kingdom'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('manchester')), 'location_country'] = 'united kingdom'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('cambridge')), 'location_country'] = 'united kingdom'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('york')), 'location_country'] = 'united kingdom'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('birmingham')), 'location_country'] = 'united kingdom'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('edinburgh')), 'location_country'] = 'united kingdom'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('newcastle')), 'location_country'] = 'united kingdom'
# germany
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('hamburg')), 'location_country'] = 'germany'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('berlin')), 'location_country'] = 'germany'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('augsburg')), 'location_country'] = 'germany'
# france
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('paris')), 'location_country'] = 'france'
# spain
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('barcelona')), 'location_country'] = 'spain'
# finland
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('helsinki')), 'location_country'] = 'finland'
# australia
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('melbourne')), 'location_country'] = 'australia'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('sidney')), 'location_country'] = 'australia'
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('canberra')), 'location_country'] = 'australia'
# singapore
users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains('singapore')), 'location_country'] = 'singapore'

'''
모든 null 값을 다 볼 순 없으므로,
user_id 별 데이터 많은 순서대로 null 값 처리
'''
# usa
usa_uids = [
    83671, 179718, 187065, 104278, 146230, 93565, 67663, 84795, 175100,
    273190, 51350, 19493, 226745, 57620, 125031, 113663, 178201, 91631,
    83443, 239535, 135228, 23680, 259264, 209229, 929, 168036, 50129,
    129368, 136465, 8937, 84523, 241749, 48743, 132188, 270897, 171045,
    44842, 115473, 1131, 91017, 68768, 167587, 135411, 30889, 221557,
    39195, 154346, 273110, 29497, 223816, 38718, 175529, 186238, 239449,
    141543, 77676, 258277, 240113, 172486, 34988, 112818, 129474, 46295,
    142041, 268035, 176102, 126985, 93386, 114601, 30650, 24105, 170850,
    28372, 207651, 122802, 129389, 266764, 269140, 50504, 52993, 170208,
    162264, 45641, 226556, 241214

]
for uid in usa_uids:
    users.loc[users['user_id'] == uid, 'location_country'] = 'usa'    
# uk
users.loc[users['user_id'] == 178522, 'location_country'] = 'united kingdom'
users.loc[users['user_id'] == 5476, 'location_country'] = 'united kingdom'
users.loc[users['user_id'] == 237064, 'location_country'] = 'united kingdom'
users.loc[users['user_id'] == 241537, 'location_country'] = 'united kingdom'
# ireland
users.loc[users['user_id'] == 26432, 'location_country'] = 'ireland'
# canada
canada_uids = [44089, 79188, 176100, 34087, 172962, 103160, 206693]
for uid in canada_uids:
    users.loc[users['user_id'] == uid, 'location_country'] = 'canada'
# france
users.loc[users['user_id'] == 179641, 'location_country'] = 'france'
# germany
users.loc[users['user_id'] == 276538, 'location_country'] = 'germany'
users.loc[users['user_id'] == 102169, 'location_country'] = 'germany'
# austria
users.loc[users['user_id'] == 3923, 'location_country'] = 'austria'
users.loc[users['user_id'] == 14393, 'location_country'] = 'austria'
# portugal
users.loc[users['user_id'] == 164581, 'location_country'] = 'portugal'
# australia
users.loc[users['user_id'] == 11399, 'location_country'] = 'australia'
# malaysia
users.loc[users['user_id'] == 30445, 'location_country'] = 'malaysia'
users.loc[users['user_id'] == 28543, 'location_country'] = 'malaysia'
# philippines
users.loc[users['user_id'] == 131023, 'location_country'] = 'philippines'
#########################

#########################
users = users.drop(['location_city', 'location_state'], axis=1)
#########################

ratings = pd.concat([train, test]).reset_index(drop=True)

#출판사
publisher_dict=(books['publisher'].value_counts()).to_dict()
publisher_count_df= pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count'])
publisher_count_df = publisher_count_df.sort_values(by=['count'], ascending = False)
modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values
for publisher in modify_list:
    try:
        number = books[books['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
        right_publisher = books[books['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
        books.loc[books[books['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
    except: 
        pass

#카테고리
books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())
category_df = pd.DataFrame(books['category'].value_counts()).reset_index()
category_df.columns = ['category','count']  
books['category_high'] = books['category'].copy()
books.loc[books[books['category']=='biography'].index, 'category_high'] = 'biography autobiography'
books.loc[books[books['category']=='autobiography'].index,'category_high'] = 'biography autobiography'
books.loc[books[books['category'].str.contains('history',na=False)].index,'category_high'] = 'history'
categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']
for category in categories:
    books.loc[books[books['category'].str.contains(category,na=False)].index,'category_high'] = category
category_high_df = pd.DataFrame(books['category_high'].value_counts()).reset_index()
category_high_df.columns = ['category','count']
# 5개 이하인 항목은 others로 묶어주도록 하겠습니다.
others_list = category_high_df[category_high_df['count']<5]['category'].values
books.loc[books[books['category_high'].isin(others_list)].index, 'category_high']='others'
del books['category']
books.rename(columns = {'category_high':'category'},inplace=True)



# 인덱싱 처리된 데이터 조인
# isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
context_df = ratings.merge(users, on='user_id', how='left').merge(books[['isbn', 'category', 'language', 'book_title','book_author','publisher']], on='isbn', how='left')
train_df = train.merge(users, on='user_id', how='left').merge(books[['isbn', 'category','language', 'book_title','book_author','publisher']], on='isbn', how='left')
test_df = test.merge(users, on='user_id', how='left').merge(books[['isbn', 'category','language', 'book_title','book_author','publisher']], on='isbn', how='left')


######################### age 변수 category 별 평균으로 전처리
cat_age_dict = train_df.groupby('category').mean()['age'].to_dict()

def fill_age(cat):
    return cat_age_dict[cat]

train_df.loc[(train_df['age'].isna()) & (train_df['category'].notna()), 'age'] =\
    train_df.loc[(train_df['age'].isna()) & (train_df['category'].notna()), 'category'].apply(fill_age)


######################### data 수가 n개 이하인 나라 추출
cnty_n = 3
cnty_val_cnt = pd.DataFrame(train_df['location_country'].value_counts())
cnty_val_cnt.loc[cnty_val_cnt['location_country'] <= cnty_n]
countries_to_empty_set = set(cnty_val_cnt.loc[cnty_val_cnt['location_country'] <= 2].index)

######################### 해당 나라들 empty로 때려박음
def one_frq_cnty_to_ety(cnty):
    if cnty in countries_to_empty_set:
        return 'empty'
    return cnty

train_df['location_country'] = train_df['location_country'].apply(one_frq_cnty_to_ety)
test_df['location_country'] = test_df['location_country'].apply(one_frq_cnty_to_ety)
test_df.loc[test_df['location_country'].isna(), 'location_country'] = 'empty'
#########################


# 인덱싱 처리
loc_country2idx = {v:k for k,v in enumerate(context_df['location_country'].unique())}
train_df['location_country'] = train_df['location_country'].map(loc_country2idx)
test_df['location_country'] = test_df['location_country'].map(loc_country2idx)

train_df['age'] = train_df['age'].fillna(int(train_df['age'].mean()))
train_df['age'] = train_df['age'].apply(age_map)

test_df['age'] = test_df['age'].fillna(int(test_df['age'].mean()))
test_df['age'] = test_df['age'].apply(age_map)

# book 파트 인덱싱
category2idx = {v:k for k,v in enumerate(context_df['category'].unique())}
publisher2idx = {v:k for k,v in enumerate(context_df['publisher'].unique())}
language2idx = {v:k for k,v in enumerate(context_df['language'].unique())}
title2idx = {v:k for k,v in enumerate(context_df['book_title'].unique())}
author2idx = {v:k for k,v in enumerate(context_df['book_author'].unique())}

train_df['category'] = train_df['category'].map(category2idx)
train_df['publisher'] = train_df['publisher'].map(publisher2idx)
train_df['language'] = train_df['language'].map(language2idx)
train_df['book_title'] = train_df['book_title'].map(title2idx)
train_df['book_author'] = train_df['book_author'].map(author2idx)

test_df['category'] = test_df['category'].map(category2idx)
test_df['publisher'] = test_df['publisher'].map(publisher2idx)
test_df['language'] = test_df['language'].map(language2idx)
test_df['book_title'] = test_df['book_title'].map(title2idx)
test_df['book_author'] = test_df['book_author'].map(author2idx)

idx = {
    # "loc_city2idx":loc_city2idx,
    # "loc_state2idx":loc_state2idx,
    "loc_country2idx":loc_country2idx,
    "category2idx":category2idx,
    "publisher2idx":publisher2idx,
    "language2idx":language2idx,
    "title2idx" : title2idx,
    "author2idx":author2idx,
}





In [8]:
train.head()

Unnamed: 0,user_id,isbn,rating
0,8,2005018,4
1,67544,2005018,7
2,123629,2005018,8
3,200273,2005018,8
4,210926,2005018,9


In [9]:
train_df

Unnamed: 0,user_id,isbn,rating,age,location_country,category,language,book_title,book_author,publisher
0,8,0002005018,4,3,0.0,0,0,0,0,0
1,67544,0002005018,7,3,0.0,0,0,0,0,0
2,123629,0002005018,8,3,0.0,0,0,0,0,0
3,200273,0002005018,8,3,0.0,0,0,0,0,0
4,210926,0002005018,9,3,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
306790,278843,0743525493,7,2,1.0,5,1,117725,1272,5
306791,278851,067161746X,6,3,1.0,7,0,96973,69,5
306792,278851,0884159221,7,3,1.0,5,1,117726,54713,185
306793,278851,0912333022,7,3,1.0,3,0,117727,54714,139


In [8]:
test_df

Unnamed: 0,user_id,isbn,rating,age,location_country,category,language,book_title,book_author,publisher
0,11676,0002005018,0,3,2.0,0,0,0,0,0
1,116866,0002005018,0,3,0.0,0,0,0,0,0
2,152827,0060973129,0,4,0.0,1,0,1,1,1
3,157969,0374157065,0,3,1.0,2,0,2,2,2
4,67958,0399135782,0,3,1.0,3,0,3,3,3
...,...,...,...,...,...,...,...,...,...,...
76694,278543,1576734218,0,3,1.0,55,0,135432,1275,160
76695,278563,3492223710,0,3,18.0,1,2,135433,10774,69
76696,278633,1896095186,0,3,1.0,3,0,135434,62056,433
76697,278668,8408044079,0,4,13.0,5,1,14530,62057,405


In [95]:
train_df = pd.get_dummies(
    train_df,
    columns=['age','location_country','category','language']
)

In [9]:
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(train_df.drop(['user_id', 'isbn', 'rating'], axis=1), train_df['rating'], test_size=0.2, shuffle=True, random_state=seed)

In [10]:
X_train_cat

Unnamed: 0,age,location_country,category,language,book_title,book_author,publisher
121312,3,1.0,3,0,14311,7508,26
265089,6,1.0,5,1,82672,37826,106
60236,3,1.0,7,0,4642,1808,138
111218,2,1.0,3,0,12255,525,4
306001,4,8.0,5,1,116985,54336,9
...,...,...,...,...,...,...,...
119879,3,1.0,3,0,13972,1325,21
259178,3,1.0,5,1,78323,35784,139
131932,5,1.0,5,1,16902,678,21
146867,2,11.0,3,3,20705,161,34


## second EDA

In [11]:
cbr= CatBoostRegressor()

In [12]:
cbr.fit(X_train_cat, y_train_cat)

Learning rate set to 0.097676
0:	learn: 2.4300413	total: 63.3ms	remaining: 1m 3s
1:	learn: 2.4273343	total: 79ms	remaining: 39.4s
2:	learn: 2.4249663	total: 94.3ms	remaining: 31.4s
3:	learn: 2.4228764	total: 106ms	remaining: 26.4s
4:	learn: 2.4214336	total: 119ms	remaining: 23.7s
5:	learn: 2.4195808	total: 131ms	remaining: 21.7s
6:	learn: 2.4182661	total: 143ms	remaining: 20.3s
7:	learn: 2.4171187	total: 155ms	remaining: 19.2s
8:	learn: 2.4161140	total: 166ms	remaining: 18.2s
9:	learn: 2.4151611	total: 178ms	remaining: 17.6s
10:	learn: 2.4134403	total: 189ms	remaining: 17s
11:	learn: 2.4126464	total: 201ms	remaining: 16.5s
12:	learn: 2.4118917	total: 212ms	remaining: 16.1s
13:	learn: 2.4105815	total: 224ms	remaining: 15.8s
14:	learn: 2.4099151	total: 236ms	remaining: 15.5s
15:	learn: 2.4092927	total: 247ms	remaining: 15.2s
16:	learn: 2.4086572	total: 259ms	remaining: 15s
17:	learn: 2.4077259	total: 270ms	remaining: 14.7s
18:	learn: 2.4073507	total: 281ms	remaining: 14.5s
19:	learn: 2.4

<catboost.core.CatBoostRegressor at 0x7f6c5d23fa90>

In [13]:
preds = cbr.predict(X_test_cat)
mean_squared_error(y_test_cat, preds)**0.5

2.36698312947468

In [14]:
df = test_df.drop(['user_id', 'isbn', 'rating'], axis=1)

In [15]:
preds = cbr.predict(df)

In [33]:
def adjust_predict(y):
    if y < 1.0:
        return 1.0
    elif y > 10.0:
        return 10.0
    return y

In [34]:
preds = list(map(adjust_predict, preds))
test['rating'] = preds

In [35]:
test.to_csv("catboost_ed3_feature_add.csv", index = False)

In [21]:
test[test['rating']<5]

Unnamed: 0,user_id,isbn,rating
34,187533,0440234743,4.887578
65,4784,0971880107,3.279238
66,8453,0971880107,3.476602
67,11554,0971880107,3.157368
68,12264,0971880107,3.157368
...,...,...,...
75276,243785,0582525063,4.370284
75572,248957,9500422298,4.078191
76403,269629,1888237007,4.655220
76496,272415,0387948791,3.324610


# 하이퍼파라미터 튜닝

In [39]:
from sklearn.model_selection import KFold, GridSearchCV, train_test_split

In [40]:
cbr2= CatBoostRegressor()

In [47]:
parameters = {'depth'         : [6,8,10],
              'learning_rate' : [0.01, 0.05, 0.1],
              'iterations'    : [30, 50, 100] 
             }

In [48]:
grid = GridSearchCV(estimator=cbr2, param_grid = parameters, cv = 2, n_jobs=-1)
grid.fit(X_train_cat, y_train_cat)

0:	learn: 2.4324372	total: 80.4ms	remaining: 2.33s
1:	learn: 2.4320753	total: 93.8ms	remaining: 1.31s
2:	learn: 2.4317356	total: 202ms	remaining: 1.82s
3:	learn: 2.4313951	total: 231ms	remaining: 1.5s
4:	learn: 2.4310783	total: 248ms	remaining: 1.24s
5:	learn: 2.4307688	total: 287ms	remaining: 1.15s
6:	learn: 2.4304461	total: 345ms	remaining: 1.14s
7:	learn: 2.4301514	total: 439ms	remaining: 1.21s
8:	learn: 2.4298323	total: 498ms	remaining: 1.16s
9:	learn: 2.4295330	total: 567ms	remaining: 1.13s
10:	learn: 2.4292166	total: 650ms	remaining: 1.12s
11:	learn: 2.4289415	total: 729ms	remaining: 1.09s
12:	learn: 2.4286494	total: 782ms	remaining: 1.02s
13:	learn: 2.4283665	total: 816ms	remaining: 932ms
14:	learn: 2.4280867	total: 846ms	remaining: 846ms
15:	learn: 2.4278392	total: 936ms	remaining: 819ms
16:	learn: 2.4275056	total: 978ms	remaining: 748ms
17:	learn: 2.4272502	total: 1.01s	remaining: 675ms
18:	learn: 2.4269853	total: 1.07s	remaining: 619ms
19:	learn: 2.4267381	total: 1.12s	remain

In [49]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostRegressor object at 0x7f6c8dd25f10>

 The best score across ALL searched params:
 0.03953922319368397

 The best parameters across ALL searched params:
 {'depth': 10, 'iterations': 100, 'learning_rate': 0.1}


In [52]:
cbr3 = CatBoostRegressor(depth=10, iterations=100, learning_rate=0.1)

In [53]:
cbr3.fit(X_train_cat, y_train_cat)

0:	learn: 2.4287142	total: 19.4ms	remaining: 1.92s
1:	learn: 2.4251260	total: 37.8ms	remaining: 1.85s
2:	learn: 2.4212894	total: 55.7ms	remaining: 1.8s
3:	learn: 2.4181647	total: 72.5ms	remaining: 1.74s
4:	learn: 2.4154493	total: 91.8ms	remaining: 1.74s
5:	learn: 2.4128245	total: 110ms	remaining: 1.72s
6:	learn: 2.4106774	total: 128ms	remaining: 1.7s
7:	learn: 2.4088176	total: 147ms	remaining: 1.69s
8:	learn: 2.4073086	total: 165ms	remaining: 1.67s
9:	learn: 2.4051850	total: 182ms	remaining: 1.64s
10:	learn: 2.4038387	total: 200ms	remaining: 1.62s
11:	learn: 2.4026537	total: 217ms	remaining: 1.59s
12:	learn: 2.4017296	total: 236ms	remaining: 1.58s
13:	learn: 2.4006524	total: 254ms	remaining: 1.56s
14:	learn: 2.3997871	total: 271ms	remaining: 1.54s
15:	learn: 2.3989080	total: 292ms	remaining: 1.53s
16:	learn: 2.3981042	total: 312ms	remaining: 1.52s
17:	learn: 2.3974599	total: 335ms	remaining: 1.52s
18:	learn: 2.3961188	total: 362ms	remaining: 1.54s
19:	learn: 2.3953575	total: 380ms	rema

<catboost.core.CatBoostRegressor at 0x7f6c8dd37310>

In [55]:
preds3 = cbr.predict(X_test_cat)
mean_squared_error(y_test_cat, preds3)**0.5

2.36698312947468

In [14]:
# !pip install optuna
import catboost
import optuna

def objective(trial):
    X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(train_df.drop(['user_id', 'isbn', 'rating'], axis=1), train_df['rating'], test_size=0.2, shuffle=True, random_state=42)

    param = {
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.001, 0.3),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "max_depth": trial.suggest_int("max_depth", 1, 15),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }
    

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_uniform("subsample", 0.1, 1)

    gbm = CatBoostRegressor(**param, iterations = 10000)

    gbm.fit(X_train_cat, y_train_cat, eval_set = [(X_test_cat, y_test_cat)], verbose = 0, early_stopping_rounds = 100)

    preds = gbm.predict(X_test_cat)
    rmse = mean_squared_error(y_test_cat, pred_labels)
    
    return rmse

study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 200, show_progress_bar = True)

[32m[I 2022-11-03 05:25:10,121][0m A new study created in memory with name: no-name-0efa1a5f-c444-453f-8c8e-888724399131[0m


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=200.0), HTML(value='')))

[33m[W 2022-11-03 05:26:34,942][0m Trial 0 failed because of the following error: NameError("name 'pred_labels' is not defined")[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_18821/3226985462.py", line 28, in objective
    rmse = mean_squared_error(y_test_cat, pred_labels)
NameError: name 'pred_labels' is not defined



NameError: name 'pred_labels' is not defined