# Libraries

In [1]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn import model_selection
from sklearn import preprocessing
datasetfinal="DatasetFinal.xlsx"
datasetstring ="Dataset.xlsx"
datasetfilledstring = "DataSetNotNull.xlsx"

In [2]:
dataset=pd.read_excel(datasetfinal)
dataset.columns

Index(['Book_ISBN', 'Book_Name', 'Book_Publisher', 'Book_Author', 'Book_Price',
       'Book_Page', 'Book_Publish_Date', 'Book_Rate', 'Book_Rating_Amount',
       'Book_Detail'],
      dtype='object')

In [3]:
dataset = dataset[["Book_ISBN", "Book_Name","Book_Publisher","Book_Author", "Book_Price","Book_Page","Book_Publish_Date","Book_Rate", "Book_Rating_Amount","Book_Detail"]]

### Age calculation of books by their Published Year

In [4]:
dataset['Age'] = 2023 - dataset['Book_Publish_Date']
dataset

Unnamed: 0,Book_ISBN,Book_Name,Book_Publisher,Book_Author,Book_Price,Book_Page,Book_Publish_Date,Book_Rate,Book_Rating_Amount,Book_Detail,Age
0,9781847941831,61,31,244,20.55,320,2018,4.38,508.702,6,5
1,9780857504791,557,130,463,33.86,416,2023,3.07,127.000,4,0
2,9781471156267,337,161,109,11.10,384,2016,4.34,1904.211,6,7
3,9781529029581,74,126,595,12.08,224,2019,3.75,178.831,6,4
4,9781398515697,532,161,574,10.87,400,2021,4.46,1711.333,6,2
...,...,...,...,...,...,...,...,...,...,...,...
930,9781405955027,239,128,476,14.60,688,2022,4.48,55.138,6,1
931,9781405297714,216,66,565,10.87,176,2019,4.12,9.562,6,4
932,9781405297042,326,66,565,11.51,464,2020,3.96,83.971,6,3
933,9781405291774,324,66,565,11.35,416,2018,4.45,284.364,6,5


### Dependent and Independent Variables

In [5]:
# Published_Date is dropped becasuse of the calculation of age. X is independent variables.
X = dataset.drop(["Book_Price","Book_Publisher","Book_Publish_Date"], axis = 1)
# Y is dependent variable
y = dataset["Book_Price"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [7]:
#Search parameters
params = {"colsample_bytree":[0.4,0.5,0.6],
         "learning_rate":[0.01,0.02,0.09],
         "max_depth":[2,3,4,5,6],
         "n_estimators":[100,200,500,2000]}

In [8]:
xgb = XGBRegressor()

In [9]:
grid = GridSearchCV(xgb, params, cv = 10, n_jobs = -1, verbose = 2)

In [10]:
#Find the best parameter for machine learning algorithm
grid.fit(X_train, y_train)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


In [11]:
#Getting Best fitting parameters
grid.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.02,
 'max_depth': 4,
 'n_estimators': 200}

In [12]:
#Appropriate parameter entry
xgb1 = XGBRegressor(colsample_bytree = 0.5, learning_rate = 0.02, max_depth = 4, n_estimators = 200)

In [13]:
#Fit the model
model_xgb = xgb1.fit(X_train, y_train)

In [14]:
# Shape of the data
X_train.shape,X_test.shape

((748, 8), (187, 8))

In [15]:
# Predict values
model_xgb.predict(X_test)[20:30]

array([35.820705, 14.31295 , 15.568503, 24.141945, 25.59559 , 15.728932,
       14.672403, 13.182094, 15.365872, 11.900545], dtype=float32)

In [16]:
# Actual values
y_test[20:30]

  y_test[20:30]


668    42.59
490    13.29
933    11.35
30     20.55
323    29.00
756    14.00
679    20.55
606    10.88
208    23.72
266     9.99
Name: Book_Price, dtype: float64

In [17]:
# Algorithm learning score.It takes a value between 0-1.
model_xgb.score(X_test, y_test)

0.747567845968035

In [18]:
model_xgb.score(X_train, y_train)

0.9054789241581829

### Test Error

In [19]:
mse=np.sqrt(-1*(cross_val_score(model_xgb, X_test, y_test, cv=10, scoring='neg_mean_squared_error'))).mean()
print('Mean square error: ' ,mse)

Mean square error:  12.313284710995877


In [20]:
import math
rmse = math.sqrt(mse)
print('Root mean square error: ', rmse)

Root mean square error:  3.5090290268101056


In [21]:
importance = pd.DataFrame({"Importance": model_xgb.feature_importances_},
                         index=X_train.columns)

In [22]:
importance
importance.to_excel("ImportanceofAttr.xlsx")