In [1]:
!pip install -q kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c predictingbookratingsint20h
!unzip train.csv.zip
!unzip test.csv.zip

Downloading test.csv.zip to /content
  0% 0.00/1.65M [00:00<?, ?B/s]
100% 1.65M/1.65M [00:00<00:00, 54.8MB/s]
Downloading submission.csv to /content
  0% 0.00/25.5k [00:00<?, ?B/s]
100% 25.5k/25.5k [00:00<00:00, 22.9MB/s]
Downloading train.csv.zip to /content
 51% 9.00M/17.5M [00:00<00:00, 22.5MB/s]
100% 17.5M/17.5M [00:00<00:00, 35.6MB/s]
Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
  inflating: test.csv                


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
train = pd.read_csv('./train.csv', index_col='id')
test = pd.read_csv('./test.csv', index_col='id')

In [4]:
train.head(3)

Unnamed: 0_level_0,book_title,book_image_url,book_desc,book_genre,book_authors,book_format,book_pages,book_review_count,book_rating_count,book_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Forastera,https://images.gr-assets.com/books/1500683049l...,"Recién acabada la Segunda Guerra Mundial, una ...",Historical|Historical Fiction|Romance|Fantasy|...,Diana Gabaldon|Carmen Bordeu,Kindle Edition,768 pages,40197,668892,4.22
1,Writing about Magic,https://images.gr-assets.com/books/1445900480l...,Do you write fantasy fiction? This book is a r...,Language|Writing|Nonfiction,Rayne Hall,Paperback,180 pages,27,126,3.95
2,The Stress of Her Regard,https://images.gr-assets.com/books/1503059955l...,When Michael Crawford discovers his bride brut...,Fantasy|Horror|Paranormal|Vampires|Historical|...,Tim Powers,Paperback,470 pages,331,3626,3.79


In [5]:
train['book_genre'] = train['book_genre'].fillna('nan')
test['book_genre'] = test['book_genre'].fillna('nan')

In [6]:
def get_genres(df):
  genres = list(df['book_genre'])
  genres = list(map(lambda x: x.split('|'), genres))
  genres = [item for sublist in genres for item in sublist]
  genres = np.array(genres)
  genres_unique, unique_counts = np.unique(genres, return_counts=True)
  return genres, dict(zip(genres_unique, unique_counts))

In [7]:
train_genres, train_genres_unique = get_genres(train)

In [8]:
def process(init_df, genres_unique, threshold = 2000):
  df = init_df.copy()
  top_genres = [g for g in genres_unique.keys() if genres_unique[g] > threshold] + ['Other']
  for genre in top_genres:
        df[f'genre_({genre})'] = np.zeros((len(df)))
  for i in range(len(df)):
    genres_list = df.iloc[i]['book_genre'].split('|')
    for genre in genres_list:
      if genre in top_genres:
        df.at[i, f'genre_({genre})'] = 1
  df['book_pages'] = df['book_pages'].astype(str).apply(lambda x: int(x.split()[0]) if x.split()[0] != 'nan' else 0)
  df = df.drop(columns=['book_genre', 'book_image_url'])
  return df

In [9]:
threshold = 2000

train_processed = train.copy() 
train_processed = process(train_processed, train_genres_unique)

test_processed = test.copy() 
test_processed = process(test_processed, train_genres_unique)

In [10]:
train_processed['book_format'] = train_processed['book_format'].fillna('nan')
test_processed['book_format'] = test_processed['book_format'].fillna('nan')

In [11]:
formats = ['soft_cover', 'hard_cover', 'ebook', 'audio', 'other']
def proc_formats(init_df, formats):
  df = init_df.copy()
  for format in formats:
        df[f'format_({format})'] = np.zeros((len(df)))
  for i in range(len(df)):
    formats_list = df.iloc[i]['book_format']
    if any([format in formats_list.lower() for format in ['paper', 'soft']]):
      df.at[i, 'format_(soft_cover)'] = 1
    elif any([format in formats_list.lower() for format in ['hard']]):
      df.at[i, 'format_(hard_cover)'] = 1
    elif any([format in formats_list.lower() for format in ['ebook', 'kindle', 'nook', 'digital']]):
      df.at[i, 'format_(ebook)'] = 1
    elif any([format in formats_list.lower() for format in ['audio', 'cd']]):
      df.at[i, 'format_(audio)'] = 1
    else:
      df.at[i, 'format_(other)'] = 1
  df = df.drop(columns=['book_format'])
  return df

In [12]:
train_processed = proc_formats(train_processed, formats)
test_processed = proc_formats(test_processed, formats)

In [13]:
train_processed['authors_number'] = train_processed['book_authors'].apply(lambda x: len(x.split('|')))
train_processed = train_processed.drop(columns=['book_authors'])
test_processed['authors_number'] = test_processed['book_authors'].apply(lambda x: len(x.split('|')))
test_processed = test_processed.drop(columns=['book_authors'])

In [14]:
train_processed.head(3)

Unnamed: 0_level_0,book_title,book_desc,book_pages,book_review_count,book_rating_count,book_rating,genre_(Biography),genre_(Childrens),genre_(Classics),genre_(Contemporary),genre_(Cultural),genre_(European Literature),genre_(Fantasy),genre_(Fiction),genre_(Historical),genre_(Historical Fiction),genre_(History),genre_(Literature),genre_(Mystery),genre_(Nonfiction),genre_(Paranormal),genre_(Religion),genre_(Romance),genre_(Science Fiction),genre_(Sequential Art),genre_(Thriller),genre_(Young Adult),genre_(nan),genre_(Other),format_(soft_cover),format_(hard_cover),format_(ebook),format_(audio),format_(other),authors_number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,Forastera,"Recién acabada la Segunda Guerra Mundial, una ...",768,40197,668892,4.22,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2
1,Writing about Magic,Do you write fantasy fiction? This book is a r...,180,27,126,3.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,The Stress of Her Regard,When Michael Crawford discovers his bride brut...,470,331,3626,3.79,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1


In [15]:
train_processed['title_len'] = train_processed['book_title'].apply(lambda x: len(x.split(' ')))
train_processed = train_processed.drop(columns=['book_desc', 'book_title'])
test_processed['title_len'] = test_processed['book_title'].apply(lambda x: len(x.split(' ')))
test_processed = test_processed.drop(columns=['book_desc', 'book_title'])

In [16]:
train_processed

Unnamed: 0_level_0,book_pages,book_review_count,book_rating_count,book_rating,genre_(Biography),genre_(Childrens),genre_(Classics),genre_(Contemporary),genre_(Cultural),genre_(European Literature),genre_(Fantasy),genre_(Fiction),genre_(Historical),genre_(Historical Fiction),genre_(History),genre_(Literature),genre_(Mystery),genre_(Nonfiction),genre_(Paranormal),genre_(Religion),genre_(Romance),genre_(Science Fiction),genre_(Sequential Art),genre_(Thriller),genre_(Young Adult),genre_(nan),genre_(Other),format_(soft_cover),format_(hard_cover),format_(ebook),format_(audio),format_(other),authors_number,title_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
0,768,40197,668892,4.22,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,1
1,180,27,126,3.95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3
2,470,331,3626,3.79,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,5
3,106,28,371,3.85,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,6
4,110,250,4331,3.72,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,491,24661,165284,4.46,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3
39996,48,130,1556,4.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,3
39997,416,3,159,4.20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,12
39998,336,645,2955,3.62,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,5


In [17]:
import xgboost as xg 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 

In [19]:
y = train_processed['book_rating']
X = train_processed.drop(columns = ['book_rating'])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42) 

In [45]:
xgb_r = xg.XGBRegressor(objective='reg:squarederror', 
                  n_estimators=1024, seed=42) 
xgb_r.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1024,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, silent=None, subsample=1, verbosity=1)

In [46]:
np.sqrt(MSE(y_val, xgb_r.predict(X_val)))

0.2950209659286718

In [51]:
from sklearn.model_selection import GridSearchCV

In [53]:
clf = GridSearchCV(xg.XGBRegressor(objective='reg:squarederror'),
                   {'max_depth': [3, 4, 5], 'n_estimators': [128, 256, 512, 1024]})
clf.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:squarederror',
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5],
                         'n_estimators': [128, 256, 512, 1024]},
         

In [55]:
clf.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=256,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [56]:
np.sqrt(MSE(y_val, clf.predict(X_val)))

0.2944671309217966

In [65]:
sub = pd.read_csv('./submission.csv', index_col = 'id')
sub['book_rating'] = clf.predict(test_processed)
sub.to_csv('./my_sub_xgb_2.csv')

In [66]:
from joblib import dump

In [67]:
dump(clf, 'our_best_model.joblib') 

['our_best_model.joblib']

In [68]:
clf3 = xg.XGBRegressor(objective='reg:linear', 
                  n_estimators=220, seed=42) 
clf3.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=220,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=1, verbosity=1)

In [69]:
np.sqrt(MSE(y_val, clf3.predict(X_val)))

0.29662018640156573

In [72]:
clf.best_estimator_.save_model('did_our_best.model')

In [74]:
clf.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=256,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [75]:
clf_load = xg.XGBRegressor()
clf_load.load_model('./did_our_best.model')

In [77]:
clf_load.predict(X_val)

array([3.8501601, 4.0649414, 3.993217 , ..., 4.06656  , 3.7473824,
       4.1552896], dtype=float32)

In [78]:
clf.predict(X_val)

array([3.8501601, 4.0649414, 3.993217 , ..., 4.06656  , 3.7473824,
       4.1552896], dtype=float32)

In [79]:
np.sqrt(MSE(y_val, clf.predict(X_val)))

0.2944671309217966