In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, linalg
# !pip install xgboost
from xgboost import XGBRegressor, XGBClassifier
# !pip install lightgbm
from lightgbm import LGBMRegressor, LGBMClassifier
# !pip install catboost
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score
import warnings

warnings.filterwarnings(action='ignore')

In [8]:
books = pd.read_csv("data/books.csv")
test = pd.read_csv("data/test_ratings.csv")
users = pd.read_csv("data/users.csv")
sub = pd.read_csv("data/sample_submission.csv")

In [9]:
seed=42

In [10]:
users['location_city'] = users['location'].apply(lambda x: x.split(',')[0])
users['location_state'] = users['location'].apply(lambda x: x.split(',')[1])
users['location_country'] = users['location'].apply(lambda x: x.split(',')[2])
users = users.drop(['location'], axis=1)



######################### location 전처리
users['location_city'] = users['location_city'].str.strip()
users['location_state'] = users['location_state'].str.strip()
users['location_country'] = users['location_country'].str.strip()

users['location_city'] = users['location_city'].str.replace(r'[^a-zA-Z]', '', regex=True)
users['location_state'] = users['location_state'].str.replace(r'[^a-zA-Z]', '', regex=True)
users['location_country'] = users['location_country'].str.replace(r'[^a-zA-Z]', '', regex=True)

'''
location_country
'''
# null & na & universe & etc
null_repl = [
    'universe', 'na', '', 'lava', 'petrolwarnation', 'space', 'lachineternelle',
    'faraway', 'everywhereandanywhere', 'hereandthere', 'tdzimi', 'naontheroad',
    'unknown'
]
for keyword in null_repl:
    users.loc[users['location_country'] == keyword, 'location_country'] = 'null'
users.loc[users['location_country'] == 'c', 'location_country'] = 'null'
# australia
australia_repl = [
    'newsouthwales', 'queensland', 'tasmania', 'victoria', 'nsw'
]
for keyword in australia_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'australia'
# italy
users.loc[users['location_country'].str.contains('ital'), 'location_country'] = 'italy'
users.loc[users['location_country'].str.contains('ferrara'), 'location_country'] = 'italy'
users.loc[users['location_country'].str.contains('veneziagiulia'), 'location_country'] = 'italy'
users.loc[users['location_country'].str.contains('ineurope'), 'location_country'] = 'italy'
# germany
users.loc[users['location_country'].str.contains('deut'), 'location_country'] = 'germany'
users.loc[users['location_country'].str.contains('germ'), 'location_country'] = 'germany'
users.loc[users['location_country'].str.contains('berlin'), 'location_country'] = 'germany'
users.loc[users['location_country'].str.contains('niedersachsen'), 'location_country'] = 'germany'
# united kingdom
uk_repls = [
    'unitedkingdom', 'eng', 'king', 'wales', 'scotland', 'aberdeenshire', 'camden', 'unitedkindgonm',
    'middlesex', 'nottinghamshire', 'westyorkshire', 'cambridgeshire', 'sthelena', 'northyorkshire',
    'obviously'
]
for keyword in uk_repls:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'united kingdom'
users.loc[users['location_country'] == 'uk', 'location_country'] = 'united kingdom'
# ireland
users.loc[users['location_country'].str.contains('countycork'), 'location_country'] = 'ireland'
users.loc[users['location_country'].str.contains('cocarlow'), 'location_country'] = 'ireland'
# france
users.loc[users['location_country'].str.contains('fran'), 'location_country'] = 'france'
users.loc[users['location_country'].str.contains('paris'), 'location_country'] = 'france'
# spain
spain_repl = [
    'esp', 'catal', 'galiza', 'euskalherria', 'lleida', 'gipuzkoa', 'orense', 'pontevedra', 'almera',
    'bergued', 'andalucia'
]
for keyword in spain_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'spain'
# portugal
users.loc[users['location_country'].str.contains('oeiras'), 'location_country'] = 'portugal'
# belgium
users.loc[users['location_country'].str.contains('labelgique'), 'location_country'] = 'belgium'
# austria
users.loc[users['location_country'].str.contains('eu'), 'location_country'] = 'austria'
# swiss
users.loc[users['location_country'].str.contains('lasuisse'), 'location_country'] = 'switzerland'
# finland
users.loc[users['location_country'].str.contains('etelsuomi'), 'location_country'] = 'finland'
# usa
usa_repl = [
    'unitedstaes', 'america', 'usa', 'state', 'sate', 'cali', 'dc', 'oregon', 'texas', 'florida',
    'newhampshire', 'newmexico', 'newjersey', 'newyork', 'virginia', 'bermuda', 'illinois', 'michigan',
    'arizona', 'indiana', 'minnesota', 'tennessee', 'dakota', 'connecticut', 'wisconsin', 'ohio',
    'maryland', 'northcarolina', 'massachusetts', 'colorado', 'washington', 'maine', 'georgia', 'oklahoma',
    'maracopa', 'districtofcolumbia', 'saintloius', 'orangeco', 'aroostook', 'arkansas', 'montana',
    'rhodeisland', 'nevada', 'kern', 'fortbend', 'nebraska', 'usofa', 'alabama', 'csa', 'polk',
    'alachua', 'austin', 'alaska', 'hawaii', 'worcester', 'iowa', 'cherokee', 'shelby', 'stthomasi',
    'vanwert', 'kansas', 'idaho', 'tn', 'framingham', 'pender', 'ysa', 'arizona', 'morgan', 'rutherford'
]
for keyword in usa_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'usa'
users.loc[users['location_country'] == 'us', 'location_country'] = 'usa'
users.loc[users['location_country'] == 'ca', 'location_country'] = 'usa'
users.loc[users['location_country'] == 'il', 'location_country'] = 'usa'
users.loc[users['location_country'] == 'ua', 'location_country'] = 'usa'
# cananda
canada_repl = [
    'cananda', 'british', 'newfoundland', 'newbrunswick', 'alberta', 'ontario', 'lkjlj', 'bc',
    'novascotia', 'kcb', 'quebec', 'maricopa', 'travelling', 'vvh', 'saskatchewan'
]
for keyword in canada_repl:
    users.loc[users['location_country'].str.contains(keyword), 'location_country'] = 'canada'
# new zealand
users.loc[users['location_country'] == 'nz', 'location_country'] = 'newzealand'
users.loc[users['location_country'].str.contains('otago'), 'location_country'] = 'newzealand'
users.loc[users['location_country'].str.contains('auckland'), 'location_country'] = 'newzealand'
# malaysia
users.loc[users['location_country'].str.contains('kedah'), 'location_country'] = 'malaysia'
# uae
users.loc[users['location_country'].str.contains('uae'), 'location_country'] = 'unitedarabemirates'
# kuwait
users.loc[users['location_country'].str.contains('quit'), 'location_country'] = 'kuwait'
# phillipines
users.loc[users['location_country'].str.contains('phill'), 'location_country'] = 'philippines'
users.loc[users['location_country'].str.contains('metromanila'), 'location_country'] = 'philippines'
# uruguay
users.loc[users['location_country'].str.contains('urugua'), 'location_country'] = 'uruguay'
# panama
users.loc[users['location_country'].str.contains('republicofpanama'), 'location_country'] = 'panama'
# trinidadandtobago
users.loc[users['location_country'].str.contains('westindies'), 'location_country'] = 'trinidadandtobago'
# guernsey
users.loc[users['location_country'].str.contains('alderney'), 'location_country'] = 'guernsey'
# japan
users.loc[users['location_country'].str.contains('okinawa'), 'location_country'] = 'japan'
# korea
users.loc[users['location_country'].str.contains('seoul'), 'location_country'] = 'southkorea'
# brazil
users.loc[users['location_country'].str.contains('disritofederal'), 'location_country'] = 'brazil'


'''
location_city
'''
# usa
usa_city_repl = [
    'losang', 'seattle', 'sanf', 'sand', 'newyork', 'newark', 'newbedford'
]
for keyword in usa_city_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains(keyword)), 'location_country'] = 'usa'
# canada
canada_city_repl = [
    'calgary', 'vancouver',
]
for keyword in canada_city_repl:
    users.loc[(users['location_country'] == 'null') & (users['location_city'].str.contains(keyword)), 'location_country'] = 'canada'
#########################

#########################
users = users.drop(['location_city', 'location_state'], axis=1)
#########################


In [11]:
users.head()

Unnamed: 0,user_id,age,location_country
0,8,,canada
1,11400,49.0,canada
2,11676,,
3,67544,30.0,canada
4,85526,36.0,canada


In [12]:
users = users.replace('null', np.nan)
users = users.replace('na', np.nan)
users = users.replace('', np.nan)

In [13]:
#출판사
import re
publisher_dict=(books['publisher'].value_counts()).to_dict()
publisher_count_df= pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count'])

publisher_count_df = publisher_count_df.sort_values(by=['count'], ascending = False)
modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values

for publisher in modify_list:
    try:
        number = books[books['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
        right_publisher = books[books['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
        books.loc[books[books['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
    except: 
        pass

#카테고리
books.loc[books[books['category'].notnull()].index, 'category'] = books[books['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+',' ',x).strip())


category_df = pd.DataFrame(books['category'].value_counts()).reset_index()
category_df.columns = ['category','count']  

books['category_high'] = books['category'].copy()
books.loc[books[books['category']=='biography'].index, 'category_high'] = 'biography autobiography'
books.loc[books[books['category']=='autobiography'].index,'category_high'] = 'biography autobiography'

books.loc[books[books['category'].str.contains('history',na=False)].index,'category_high'] = 'history'

categories = ['garden','crafts','physics','adventure','music','fiction','nonfiction','science','science fiction','social','homicide',
'sociology','disease','religion','christian','philosophy','psycholog','mathemat','agricult','environmental',
'business','poetry','drama','literary','travel','motion picture','children','cook','literature','electronic',
'humor','animal','bird','photograph','computer','house','ecology','family','architect','camp','criminal','language','india']

for category in categories:
    books.loc[books[books['category'].str.contains(category,na=False)].index,'category_high'] = category

category_high_df = pd.DataFrame(books['category_high'].value_counts()).reset_index()
category_high_df.columns = ['category','count']

# 5개 이하인 항목은 others로 묶어주도록 하겠습니다.
others_list = category_high_df[category_high_df['count']<5]['category'].values
books.loc[books[books['category_high'].isin(others_list)].index, 'category_high']='others'

In [14]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149570 entries, 0 to 149569
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   isbn                 149570 non-null  object 
 1   book_title           149570 non-null  object 
 2   book_author          149570 non-null  object 
 3   year_of_publication  149570 non-null  float64
 4   publisher            149570 non-null  object 
 5   img_url              149570 non-null  object 
 6   language             82343 non-null   object 
 7   category             80719 non-null   object 
 8   summary              82343 non-null   object 
 9   img_path             149570 non-null  object 
 10  category_high        80719 non-null   object 
dtypes: float64(1), object(10)
memory usage: 12.6+ MB


In [15]:
users_df = pd.get_dummies(
    users,
    columns=['location_country']
)
users_df.head()

Unnamed: 0,user_id,age,location_country_afghanistan,location_country_albania,location_country_algeria,location_country_andorra,location_country_angola,location_country_antarctica,location_country_antiguaandbarbuda,location_country_argentina,...,location_country_ukraine,location_country_united kingdom,location_country_unitedarabemirates,location_country_uruguay,location_country_usa,location_country_uzbekistan,location_country_venezuela,location_country_yugoslavia,location_country_zambia,location_country_zimbabwe
0,8,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,11400,49.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11676,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,67544,30.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,85526,36.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
books.columns

Index(['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path',
       'category_high'],
      dtype='object')

In [17]:
books = books[['isbn','category_high','publisher','language']]

In [18]:
# 간결성을 위해 category 첫번째만 사용
# books_cat = (pd.concat([books, books['category'].str.replace(r'[^0-9a-zA-Z:,]+', '').str.split(',', expand=True)], axis=1)
#   .drop(['category',1,2,3], axis=1).rename(columns={0:'category'}))
books_df = pd.get_dummies(
  books, 
  columns=['category_high', 'publisher', 'language']
)
books_df.head()

Unnamed: 0,isbn,category_high_AIDS Disease,category_high_Abduction,category_high_Aboriginal Australians,category_high_Abortion,category_high_Abused wives,category_high_Accelerated readers,category_high_Accidents,category_high_Achievement motivation,category_high_Acting,...,language_ms,language_nl,language_no,language_pt,language_ro,language_ru,language_th,language_vi,language_zh-CN,language_zh-TW
0,2005018,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60973129,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,374157065,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,399135782,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,425176428,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
ratings = pd.read_csv("data/train_ratings.csv")

In [22]:
ratings['rating'] = ratings['rating']-1

In [23]:
ratings

Unnamed: 0,user_id,isbn,rating
0,8,0002005018,3
1,67544,0002005018,6
2,123629,0002005018,7
3,200273,0002005018,7
4,210926,0002005018,8
...,...,...,...
306790,278843,0743525493,6
306791,278851,067161746X,5
306792,278851,0884159221,6
306793,278851,0912333022,6


In [24]:
data = ratings.merge(books_df, on='isbn', how='inner').merge(users_df, on='user_id', how='inner')

# catboost에 사용하기 위해 one-hot encoding 형태가 아닌 카테고리 변수를 그대로 사용하는 dataframe
# data_cat = ratings.merge(books_cat.drop(['book_title', 'book_author'], axis=1), on='isbn', how='inner').merge(users, on='user_id', how='inner')

In [25]:
data

Unnamed: 0,user_id,isbn,rating,category_high_AIDS Disease,category_high_Abduction,category_high_Aboriginal Australians,category_high_Abortion,category_high_Abused wives,category_high_Accelerated readers,category_high_Accidents,...,location_country_ukraine,location_country_united kingdom,location_country_unitedarabemirates,location_country_uruguay,location_country_usa,location_country_uzbekistan,location_country_venezuela,location_country_yugoslavia,location_country_zambia,location_country_zimbabwe
0,8,0002005018,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,074322678X,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8,0887841740,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,1552041778,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,1567407781,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306790,278411,0446608831,7,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
306791,278621,1550390961,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306792,278636,0375507299,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
306793,278659,0345330293,9,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [34]:
import regex

In [40]:
for i in data.columns.values:
    print(i)

user_id
isbn
rating
category_high_AIDS Disease
category_high_Abduction
category_high_Aboriginal Australians
category_high_Abortion
category_high_Abused wives
category_high_Accelerated readers
category_high_Accidents
category_high_Achievement motivation
category_high_Acting
category_high_Actors
category_high_Actresses
category_high_Adolescence
category_high_Adoptees
category_high_Adoption
category_high_Adultery
category_high_Adventure
category_high_Adventure stories
category_high_Adventure stories American
category_high_Adventure stories English
category_high_Advertising
category_high_Aeronautics
category_high_Africa
category_high_African American authors
category_high_African American families
category_high_African American men
category_high_African American women
category_high_African Americans
category_high_Aging
category_high_Air pilots
category_high_Alaska
category_high_Alchemists
category_high_Alcoholics
category_high_Aliens
category_high_Almanacs
category_high_Alphabet
category_h

In [35]:
data.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in data.columns.values]

TypeError: sub() missing 1 required positional argument: 'string'

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(['user_id', 'isbn', 'rating'], axis=1), data['rating'], 
                                                    test_size=0.2, shuffle=True, random_state=seed)

# X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(data_cat.drop(['user_id', 'isbn', 'rating'], axis=1), data_cat['rating'], test_size=0.2, shuffle=True, random_state=seed)

In [27]:
ratings

Unnamed: 0,user_id,isbn,rating
0,8,0002005018,3
1,67544,0002005018,6
2,123629,0002005018,7
3,200273,0002005018,7
4,210926,0002005018,8
...,...,...,...
306790,278843,0743525493,6
306791,278851,067161746X,5
306792,278851,0884159221,6
306793,278851,0912333022,6


In [28]:
X_train

Unnamed: 0,category_high_AIDS Disease,category_high_Abduction,category_high_Aboriginal Australians,category_high_Abortion,category_high_Abused wives,category_high_Accelerated readers,category_high_Accidents,category_high_Achievement motivation,category_high_Acting,category_high_Actors,...,location_country_ukraine,location_country_united kingdom,location_country_unitedarabemirates,location_country_uruguay,location_country_usa,location_country_uzbekistan,location_country_venezuela,location_country_yugoslavia,location_country_zambia,location_country_zimbabwe
121312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
265089,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
60236,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
111218,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
306001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
259178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
131932,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146867,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
xgb_cl = XGBClassifier()

In [30]:
# Fit
xgb_cl.fit(X_train, y_train)

ValueError: feature_names must be string, and may not contain [, ] or <

In [None]:
# Predict
preds = xgb_cl.predict(X_test)

In [None]:
# Score
accuracy_scaore(y_test, preds)

In [None]:
print('RMSE : ', rmse(y_test, preds))
print('MAE : ', mae(y_test, preds))

In [41]:
lgbm_cl = LGBMClassifier()

In [42]:
# Fit
lgbm_cl.fit(X_train, y_train)

[LightGBM] [Fatal] Do not support special JSON characters in feature name.


LightGBMError: Do not support special JSON characters in feature name.

In [None]:
# Predict
preds = lgbm_cl.predict(X_test)

In [None]:
# Score
accuracy_score(y_test, preds)

In [None]:
print('RMSE : ', rmse(y_test, preds))
print('MAE : ', mae(y_test, preds))

In [43]:
catboost_cl = CatBoostClassifier()

In [44]:
catboost_cl.fit(X_train, y_train)

Learning rate set to 0.104764
0:	learn: 2.2545641	total: 139ms	remaining: 2m 18s
1:	learn: 2.2180319	total: 242ms	remaining: 2m
2:	learn: 2.1894058	total: 328ms	remaining: 1m 49s
3:	learn: 2.1666741	total: 418ms	remaining: 1m 44s
4:	learn: 2.1488931	total: 512ms	remaining: 1m 41s
5:	learn: 2.1345951	total: 620ms	remaining: 1m 42s
6:	learn: 2.1225316	total: 731ms	remaining: 1m 43s
7:	learn: 2.1127538	total: 839ms	remaining: 1m 44s
8:	learn: 2.1047061	total: 955ms	remaining: 1m 45s
9:	learn: 2.0978966	total: 1.05s	remaining: 1m 44s
10:	learn: 2.0920715	total: 1.14s	remaining: 1m 42s
11:	learn: 2.0878282	total: 1.23s	remaining: 1m 41s
12:	learn: 2.0838933	total: 1.33s	remaining: 1m 40s
13:	learn: 2.0798752	total: 1.42s	remaining: 1m 40s
14:	learn: 2.0768010	total: 1.51s	remaining: 1m 39s
15:	learn: 2.0743610	total: 1.6s	remaining: 1m 38s
16:	learn: 2.0725124	total: 1.68s	remaining: 1m 37s
17:	learn: 2.0711353	total: 1.77s	remaining: 1m 36s
18:	learn: 2.0697156	total: 1.86s	remaining: 1m 3

<catboost.core.CatBoostClassifier at 0x7fb2a7188bb0>

In [45]:
# Predict
preds = catboost_cl.predict(X_test)

In [46]:
preds

array([[7],
       [7],
       [7],
       ...,
       [7],
       [7],
       [7]])

In [49]:
accuracy_score(y_test, preds)

0.24826023892175558

In [47]:
cbr= CatBoostRegressor()

In [48]:
cbr.fit(X_train, y_train)

Learning rate set to 0.097676
0:	learn: 2.4290296	total: 13.3ms	remaining: 13.3s
1:	learn: 2.4258226	total: 26.2ms	remaining: 13.1s
2:	learn: 2.4234257	total: 38.2ms	remaining: 12.7s
3:	learn: 2.4213291	total: 51.2ms	remaining: 12.8s
4:	learn: 2.4193641	total: 63.3ms	remaining: 12.6s
5:	learn: 2.4177258	total: 76.1ms	remaining: 12.6s
6:	learn: 2.4164189	total: 88.3ms	remaining: 12.5s
7:	learn: 2.4153039	total: 100ms	remaining: 12.5s
8:	learn: 2.4142521	total: 114ms	remaining: 12.5s
9:	learn: 2.4132682	total: 127ms	remaining: 12.6s
10:	learn: 2.4122981	total: 139ms	remaining: 12.5s
11:	learn: 2.4115575	total: 151ms	remaining: 12.5s
12:	learn: 2.4108673	total: 163ms	remaining: 12.4s
13:	learn: 2.4102682	total: 175ms	remaining: 12.3s
14:	learn: 2.4092662	total: 187ms	remaining: 12.3s
15:	learn: 2.4084355	total: 199ms	remaining: 12.3s
16:	learn: 2.4079132	total: 213ms	remaining: 12.3s
17:	learn: 2.4073956	total: 226ms	remaining: 12.3s
18:	learn: 2.4070258	total: 238ms	remaining: 12.3s
19:	

<catboost.core.CatBoostRegressor at 0x7fb2a49964c0>

In [50]:
preds = catboost_cl.predict(X_test)
accuracy_score(y_test, preds)

0.24826023892175558

In [51]:
catboost_cl.predict(test)

CatBoostError: Bad value for num_feature[non_default_doc_idx=372,feature_idx=1]="042518630X": Cannot convert 'b'042518630X'' to float