In [1]:
import numpy as np 
import pandas as pd 
from catboost import * 
from PIL import Image 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split
import os
import torch
import torch.nn as nn
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def change_age(x):
    """Map a numeric age to a coarse bucket index in 0..8.

    Buckets: below 10 -> 0, then one bucket per decade up to [60, 70) -> 6,
    [70, 100) -> 7, and everything else -> 8. "Everything else" covers ages
    of 100 or more as well as NaN, because every comparison against NaN is
    false.
    """
    # Anything that is not strictly below 100 (including NaN) is the catch-all bucket.
    if not x < 100:
        return 8
    # Upper bounds of buckets 0..6; the first bound that exceeds x wins.
    for bucket, upper in enumerate((10, 20, 30, 40, 50, 60, 70)):
        if x < upper:
            return bucket
    # Remaining range is [70, 100).
    return 7

In [7]:
# Folder that holds every CSV used by this notebook (relative to the working directory).
dpath = 'data/v4/'


def _load_table(filename):
    """Read one CSV file located directly under ``dpath`` into a DataFrame."""
    return pd.read_csv(os.path.join(dpath, filename))


users = _load_table('users.csv')
books = _load_table('books.csv')
train = _load_table('train_ratings.csv')
test = _load_table('test_ratings.csv')
submission = _load_table('sample_submission.csv')

In [8]:
# Replace each raw age with the bucket index returned by change_age.
# NOTE(review): this overwrites users['age'] in place, so running the cell a
# second time would bucket the bucket indices themselves; reload `users`
# before re-running.
users['age'] = users['age'].apply(change_age)

In [9]:
# Distinct user ids / ISBNs seen in the training ratings or the submission template.
# NOTE(review): `test` is not part of this union; any test key missing from both
# sources would map to NaN when these lookups are applied -- confirm that test
# and submission share the same keys.
ids = pd.concat([train['user_id'], submission['user_id']]).unique()
isbns = pd.concat([train['isbn'], submission['isbn']]).unique()

# Position -> original key.
idx2user = dict(enumerate(ids))
idx2isbn = dict(enumerate(isbns))

# Original key -> position (inverse of the dictionaries above).
user2idx = {user: pos for pos, user in idx2user.items()}
isbn2idx = {code: pos for pos, code in idx2isbn.items()}

# Working copies used for the joins; the image columns are dropped from the book table.
users_ = users.copy()
books_ = books.drop(columns=['img_url', 'img_path'])

In [10]:
def _join_and_index(ratings):
    """Left-join user and book attributes onto ``ratings`` and index its keys.

    The joins use the raw ``user_id`` / ``isbn`` values; only afterwards are
    those two columns replaced by their integer positions from ``user2idx``
    and ``isbn2idx``.
    """
    merged = pd.merge(ratings, users_, on='user_id', how='left')
    merged = pd.merge(merged, books_, on='isbn', how='left')
    merged['user_id'] = merged['user_id'].map(user2idx)
    merged['isbn'] = merged['isbn'].map(isbn2idx)
    return merged


# NOTE(review): this rebinds the input names, so the cell is not idempotent --
# re-running it would join the attribute tables a second time.
train = _join_and_index(train)
submission = _join_and_index(submission)
test = _join_and_index(test)

In [11]:
# Cast the publication year to int in each of the three frames.
for frame in (train, submission, test):
    frame['year_of_publication'] = frame['year_of_publication'].astype(int)

In [12]:
# Fill every remaining missing value with the string '-1' in all three frames.
train, submission, test = (
    frame.fillna('-1') for frame in (train, submission, test)
)

# Move `rating` to the last column of the submission frame (remove, then re-append).
sub_rating = submission['rating']
submission = submission.drop(columns=['rating'])
submission['rating'] = sub_rating


In [14]:
# Hold out 20% of the training rows for validation; the split is shuffled with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns='rating'),
    train['rating'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [15]:
def rmse(real: list, predict: list) -> float:
    """Return the root-mean-squared error between ``real`` and ``predict``.

    ``predict`` is converted to a NumPy array; ``real`` is used as given and
    only has to support subtraction with that array.
    """
    residual = real - np.array(predict)
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: ``sqrt(MSE(x, y) + eps)``."""

    def __init__(self):
        super().__init__()
        # Small constant added under the square root, so the root is never
        # taken at exactly zero.
        self.eps = 1e-6

    def forward(self, x, y):
        """Return the scalar RMSE (with ``eps`` offset) between ``x`` and ``y``."""
        mean_sq_err = nn.MSELoss()(x, y)
        return torch.sqrt(mean_sq_err + self.eps)

In [16]:
# Display the training feature frame produced by the train/validation split.
X_train

Unnamed: 0,user_id,isbn,age,location_city,location_state,location_country,rating_count,book_title,book_author,year_of_publication,publisher,language,category,summary,category_high
121312,22946,15310,4,arlington,virginia,usa,22,Grendel,John Champlin Gardner,1989,Vintage Books USA,en,fiction,The Beowulf story retold from the monster&#39;...,fiction
265089,44452,91405,6,portland,oregon,usa,5,Solomon's Decision,Judith B. Glad,2002,Tokyopop,en,-1,-1,-1
60236,15913,4800,3,fortcollins,colorado,usa,9,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,Scribner,en,humor,A whimsical adaptation of classic fairy tales ...,humor
111218,1339,13043,2,badaxe,michigan,usa,113,Eyes of Prey,John Sandford,2004,Berkley Publishing Group,en,fiction,When a series of gruesome mutilation killings ...,fiction
306001,15663,128985,4,amora,estremadura,estremadura,71,Culture Shock!: South Africa,Dee Rissik,1993,Health Communications,en,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,2752,14936,3,fpo,ap,usa,174,The Genesis Code,John Case,1998,Ballantine Books,en,fiction,"In a suburb of Washington, D.C., Joe Lassiter ...",fiction
259178,8108,86566,3,newalbany,indiana,usa,4,The Marriage Dictionary,Tom Carey,1995,Pub Group West,en,-1,-1,-1
131932,10534,18168,5,westbloomfield,michigan,usa,8,Billy Straight : A Novel,JONATHAN KELLERMAN,1999,Ballantine Books,en,-1,-1,-1
146867,40246,22419,2,noisylegrand,seinestdenis,france,1,La Vie est ailleurs,Milan Kundera,1976,Gallimard,fr,fiction,"Postface, p. 397-405, écrite en 1978 (Lire Kun...",fiction


In [17]:
# Number of feature columns in the training frame.
X_train.shape[1]

15

In [25]:
# Treat every feature as categorical except `rating_count`, the column the
# previous positional list (indices 0-14 without 6) left out. Selecting by
# name keeps the choice correct if the column order of X_train changes.
cat_features = [col for col in X_train.columns if col != 'rating_count']

model = CatBoostRegressor(
    iterations=10000,
    depth=6,
    learning_rate=0.05,
    random_seed=42,
    verbose=50,  # print a progress line every 50 iterations
)

# The validation split serves as eval_set; use_best_model=True keeps the
# iteration with the best validation score.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_valid, y_valid),
    use_best_model=True,
)

0:	learn: 2.4174387	test: 2.4137792	best: 2.4137792 (0)	total: 105ms	remaining: 17m 26s
50:	learn: 2.2219653	test: 2.1727156	best: 2.1727156 (50)	total: 5.06s	remaining: 16m 26s
100:	learn: 2.2066633	test: 2.1541488	best: 2.1541488 (100)	total: 16.3s	remaining: 26m 37s
150:	learn: 2.2010505	test: 2.1492847	best: 2.1492847 (150)	total: 28.1s	remaining: 30m 34s
200:	learn: 2.1973047	test: 2.1462854	best: 2.1462854 (200)	total: 40.5s	remaining: 32m 52s
250:	learn: 2.1944491	test: 2.1440590	best: 2.1440590 (250)	total: 52.6s	remaining: 34m 4s
300:	learn: 2.1916945	test: 2.1422863	best: 2.1422863 (300)	total: 1m 5s	remaining: 35m 16s
350:	learn: 2.1896382	test: 2.1410928	best: 2.1410928 (350)	total: 1m 17s	remaining: 35m 36s
400:	learn: 2.1876262	test: 2.1398966	best: 2.1398966 (400)	total: 1m 30s	remaining: 36m 3s
450:	learn: 2.1857135	test: 2.1386556	best: 2.1386556 (450)	total: 1m 42s	remaining: 36m 12s
500:	learn: 2.1841347	test: 2.1378017	best: 2.1378014 (499)	total: 1m 55s	remaining: 

<catboost.core.CatBoostRegressor at 0x7fb6fa367130>

In [20]:
# Best metric values recorded by the fitted model for the learn and validation sets.
model.get_best_score()

{'learn': {'RMSE': 2.1737556982842934},
 'validation': {'RMSE': 2.134623484111431}}

In [26]:
# Score the submission rows using every column except the placeholder `rating`.
submission_features = submission.drop(columns='rating')
pred = model.predict(submission_features)
len(pred)

76699

In [27]:
# Reload the submission template from disk (original ids, no merged feature columns).
sample_path = os.path.join(dpath, 'sample_submission.csv')
sample_submission = pd.read_csv(sample_path)
len(sample_submission)

76699

In [38]:
# Copy of the feature-enriched submission frame with `rating` replaced by the predictions.
pred_submission = submission.assign(rating=pred)

In [28]:
# Write the predictions into the freshly loaded template, matched by row position.
# NOTE(review): `pred` follows the row order of `submission`; this assumes the
# left merges kept the template's row order and count -- confirm.
sample_submission['rating'] = pred

In [58]:
# Show the working directory that the relative data/output paths are resolved against.
os.getcwd()

'/opt/ml'

In [29]:
# Destination is relative to the current working directory.
out_path = 'code/submit/catboost_v4_1_submission.csv'
# Create the target folder if needed, so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
sample_submission.to_csv(out_path, index=False)

In [50]:
# Translate the integer user indices back to the original ids (display only; nothing is stored).
pred_submission['user_id'].map(idx2user)

0         11676
1        116866
2        152827
3        157969
4         67958
          ...  
76694    278543
76695    278563
76696    278633
76697    278668
76698    278851
Name: user_id, Length: 76699, dtype: int64

In [54]:
# Translate the integer book indices back to the original ISBNs (display only; nothing is stored).
pred_submission['isbn'].map(idx2isbn)

0        0002005018
1        0002005018
2        0060973129
3        0374157065
4        0399135782
            ...    
76694    1576734218
76695    3492223710
76696    1896095186
76697    8408044079
76698    0767907566
Name: isbn, Length: 76699, dtype: object

In [70]:
# Raw per-feature importance values of the fitted model.
model.feature_importances_

array([6.03397012e+01, 4.42881231e-02, 1.87064969e+00, 2.12270226e+00,
       7.04342400e+00, 1.60583707e+01, 2.03785915e+00, 7.26898546e+00,
       6.49503360e-01, 2.39280077e+00, 1.71715328e-01])

In [69]:
# Column names of the training feature frame.
# NOTE(review): the saved output of this cell lists 11 columns, while the earlier
# X_train.shape[1] cell reports 15 -- the saved output appears to come from a
# different run; re-execute the notebook top to bottom to refresh it.
X_train.columns

Index(['user_id', 'isbn', 'location', 'age', 'book_title', 'book_author',
       'year_of_publication', 'publisher', 'language', 'category', 'summary'],
      dtype='object')

In [71]:
# Pair each training column with its importance and list the most important first.
# NOTE(review): relies on X_train still being the frame the model was fitted on.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
importance_table.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
0,user_id,60.339701
5,book_author,16.058371
7,publisher,7.268985
4,book_title,7.043424
9,category,2.392801
3,age,2.122702
6,year_of_publication,2.037859
2,location,1.87065
8,language,0.649503
10,summary,0.171715


In [75]:
# Number of directory entries under data/images/.
len(os.listdir('data/images/'))

149570