In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.stats import randint

import warnings
warnings.filterwarnings("ignore", message="^Unlike other reduction functions.*", category=FutureWarning)

In [43]:
dataset = pd.read_csv("dataset.csv") 
authors = pd.read_csv("authors.csv") 
categories = pd.read_csv("categories.csv") 
formats = pd.read_csv("formats.csv") 
places = pd.read_csv("places.csv")

In [44]:
dataset.head()


Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,edition,edition-statement,for-ages,...,isbn10,isbn13,lang,publication-date,publication-place,rating-avg,rating-count,title,url,weight
0,[1],49848.0,"[214, 220, 237, 2646, 2647, 2659, 2660, 2679]",SOLDIER FIVE is an elite soldier's explosive m...,129.0,198.0,20.0,,,,...,184018907X,9781840189070,en,2004-10-14 00:00:00,,4.03,292.0,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,224.0
1,"[2, 3]",115215.0,"[235, 3386]",John Moran and Carl Williams were the two bigg...,127.0,203.2,25.4,,,,...,184454737X,9781844547371,en,2009-03-13 00:00:00,,3.6,335.0,Underbelly : The Gangland War,/Underbelly-Andrew-Rule/9781844547371,285.76
2,[4],11732.0,"[358, 2630, 360, 2632]",Sir Phillip knew that Eloise Bridgerton was a ...,150.0,224.0,28.0,New edition,,,...,8416327866,9788416327867,es,2020-04-30 00:00:00,,3.88,37211.0,"A Sir Phillip, Con Amor",/Sir-Phillip-Con-Amor-Julia-Quinn/9788416327867,386.0
3,"[5, 6, 7, 8]",114379.0,"[377, 2978, 2980]",The Third Book of General Ignorance gathers t...,153.0,234.0,24.0,,Export - Airside ed,,...,571308996,9780571308996,en,2015-10-01 00:00:00,,4.17,384.0,QI: The Third Book of General Ignorance,/QI-Third-Book-General-Ignorance-John-Lloyd/97...,436.0
4,[9],98413.0,"[2813, 2980]",The Try Guys deliver their first book-an inspi...,191.0,240.0,29.0,,,,...,8352518,9780008352516,en,2019-06-18 00:00:00,,3.9,5095.0,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,980.0


In [45]:
authors.head()


Unnamed: 0,author_id,author_name
0,9561,
1,451324,# House Press
2,454250,# Petal Press
3,249724,#GARCIA MIGUELE
4,287710,#Worldlcass Media


In [46]:
categories.head()


Unnamed: 0,category_id,category_name
0,1998,.Net Programming
1,176,20th Century & Contemporary Classical Music
2,3291,20th Century & Contemporary Classical Music
3,2659,20th Century History: C 1900 To C 2000
4,2661,21st Century History: From C 2000 -


In [47]:

formats.head()


Unnamed: 0,format_id,format_name
0,21,Address
1,5,Audio
2,27,Bath
3,44,Big
4,14,Board


In [48]:
# Define a function to convert a list of ids to a list of names using the mappings
def id_to_name(id_list, mapping):
  # Convert the id_list from string to list
  id_list = eval(id_list)
  # Initialize an empty list to store the names
  name_list = []
  # Loop through each id in the id_list
  for id in id_list:
    # Try to find the corresponding name in the mapping using the id as the key
    try:
      name = mapping[id]
    # If the id is not found, catch the KeyError exception and assign a default value of null
    except KeyError:
      name = 'null'
    # Append the name to the name_list
    name_list.append(name)
  # Return the name_list as a string
  return str(name_list)


In [49]:

# Create new columns for authors_name, categories_name, and format_name using the id_to_name function and the mappings
dataset['authors_name'] = dataset['authors'].apply(lambda x: id_to_name(x, authors['author_name']))
dataset['categories_name'] = dataset['categories'].apply(lambda x: id_to_name(x, categories['category_name']))

# Drop the original columns for authors, categories, and format
dataset = dataset.drop(['authors', 'categories'], axis=1)

# Merge the main dataset and the format file on the format and format_id columns using a left join
dataset = pd.merge(dataset, formats, how='left', left_on='format', right_on='format_id')

# Rename the format_name column to format_name
dataset = dataset.rename(columns={'format_name': 'format_name'})

# Drop the original format and format_id columns
dataset = dataset.drop(['format', 'format_id'], axis=1)

# Save the modified dataset
# dataset.to_csv('dataset_modified.csv', index=False)

In [50]:
dataset.head()

Unnamed: 0,bestsellers-rank,description,dimension-x,dimension-y,dimension-z,edition,edition-statement,for-ages,id,illustrations-note,...,publication-date,publication-place,rating-avg,rating-count,title,url,weight,authors_name,categories_name,format_name
0,49848.0,SOLDIER FIVE is an elite soldier's explosive m...,129.0,198.0,20.0,,,,9781840189070,,...,2004-10-14 00:00:00,,4.03,292.0,Soldier Five : The Real Truth About The Bravo ...,/Soldier-Five-Mike-Coburn/9781840189070,224.0,['# House Press'],"['Aviation & Space Medicine', 'Baby Books', 'B...",Paperback
1,115215.0,John Moran and Carl Williams were the two bigg...,127.0,203.2,25.4,,,,9781844547371,,...,2009-03-13 00:00:00,,3.6,335.0,Underbelly : The Gangland War,/Underbelly-Andrew-Rule/9781844547371,285.76,"['# Petal Press', '#GARCIA MIGUELE']","['Baptist Churches', 'null']",Paperback
2,11732.0,Sir Phillip knew that Eloise Bridgerton was a ...,150.0,224.0,28.0,New edition,,,9788416327867,,...,2020-04-30 00:00:00,,3.88,37211.0,"A Sir Phillip, Con Amor",/Sir-Phillip-Con-Amor-Julia-Quinn/9788416327867,386.0,['#Worldlcass Media'],"['Cartoons & Comic Strips', 'True Stories for ...",Paperback
3,114379.0,The Third Book of General Ignorance gathers t...,153.0,234.0,24.0,,Export - Airside ed,,9780571308996,,...,2015-10-01 00:00:00,,4.17,384.0,QI: The Third Book of General Ignorance,/QI-Third-Book-General-Ignorance-John-Lloyd/97...,436.0,"['#shakeback Publishing', '& Rueckert Elkins...","['Chakras, Auras & Spiritual Energy', 'null', ...",Paperback
4,98413.0,The Try Guys deliver their first book-an inspi...,191.0,240.0,29.0,,,,9780008352516,,...,2019-06-18 00:00:00,,3.9,5095.0,The Hidden Power of F*cking Up,/Hidden-Power-F-cking-Up-Try-Guys/9780008352516,980.0,['& Bonchek Shepsle & Bonchek'],"['null', 'null']",Hardback


In [51]:
print("The dataset has", dataset.shape[0], "rows and", dataset.shape[1], "columns.")

The dataset has 1109383 rows and 28 columns.


In [52]:
print("The columns and their data types are:") 
print(dataset.dtypes)

The columns and their data types are:
bestsellers-rank      float64
description            object
dimension-x           float64
dimension-y           float64
dimension-z           float64
edition                object
edition-statement      object
for-ages               object
id                      int64
illustrations-note     object
image-checksum         object
image-path             object
image-url              object
imprint                object
index-date            float64
isbn10                 object
isbn13                  int64
lang                   object
publication-date       object
publication-place     float64
rating-avg            float64
rating-count          float64
title                  object
url                    object
weight                float64
authors_name           object
categories_name        object
format_name            object
dtype: object


In [53]:

print("The number of null values in each column are:") 
print(dataset.isnull().sum())

The number of null values in each column are:
bestsellers-rank       466842
description             80087
dimension-x             48227
dimension-y             93531
dimension-z             48227
edition                926569
edition-statement      747261
for-ages              1033390
id                          0
illustrations-note     752907
image-checksum             27
image-path                 27
image-url                  27
imprint                830049
index-date            1109383
isbn10                      0
isbn13                      0
lang                    60407
publication-date         2603
publication-place     1109383
rating-avg             440130
rating-count           440130
title                       0
url                         0
weight                  87173
authors_name                0
categories_name             0
format_name              6622
dtype: int64


In [54]:
print("The summary of each column is:") 
print(dataset.describe())

The summary of each column is:
       bestsellers-rank   dimension-x   dimension-y   dimension-z  \
count      6.425410e+05  1.061156e+06  1.015852e+06  1.061156e+06   
mean       9.209777e+05  1.607659e+02  2.249102e+02  2.644117e+01   
std        8.642210e+05  3.802671e+01  4.347322e+01  4.779869e+01   
min        1.001000e+03  2.500000e-01  1.000000e+00  1.000000e-01   
25%        1.654890e+05  1.380000e+02  2.030000e+02  9.000000e+00   
50%        6.646830e+05  1.520000e+02  2.290000e+02  1.575000e+01   
75%        1.455812e+06  1.780000e+02  2.420000e+02  2.500000e+01   
max        3.679776e+06  3.871000e+03  2.000000e+03  2.000000e+03   

                 id  index-date        isbn13  publication-place  \
count  1.109383e+06         0.0  1.109383e+06                0.0   
mean   9.781658e+12         NaN  9.781658e+12                NaN   
std    1.747523e+09         NaN  1.747523e+09                NaN   
min    9.780000e+12         NaN  9.780000e+12                NaN   
25%    

In [55]:
print("The correlation matrix of the dataset is:") 
print(dataset.corr())

The correlation matrix of the dataset is:


  print(dataset.corr())


                   bestsellers-rank  dimension-x  dimension-y  dimension-z  \
bestsellers-rank           1.000000     0.060753     0.052163     0.032632   
dimension-x                0.060753     1.000000     0.804573     0.129267   
dimension-y                0.052163     0.804573     1.000000     0.022752   
dimension-z                0.032632     0.129267     0.022752     1.000000   
id                         0.181542     0.024287     0.037383     0.128227   
index-date                      NaN          NaN          NaN          NaN   
isbn13                     0.181542     0.024287     0.037383     0.128227   
publication-place               NaN          NaN          NaN          NaN   
rating-avg                -0.103340     0.051849     0.043008     0.026560   
rating-count              -0.054883    -0.050008    -0.046098     0.014729   
weight                     0.059017     0.240304     0.249185     0.169369   

                         id  index-date    isbn13  publication-

In [61]:
dataset = dataset.dropna(subset=['lang'])

# dataset = dataset.drop(['image-checksum'], axis=1)
# dataset = dataset.drop(['image-path'], axis=1)
# dataset = dataset.drop(['image-url'], axis=1)
# dataset = dataset.drop(['index-date'], axis=1)
# dataset = dataset.drop(['publication-date'], axis=1)
# dataset = dataset.drop(['publication-place'], axis=1)
# dataset = dataset.drop(['url'], axis=1)
# dataset = dataset.drop(['title'], axis=1)   
# dataset = dataset.drop(['description'], axis=1)
# dataset = dataset.drop(['id'], axis=1)
# dataset = dataset.drop(['edition'], axis=1)
# dataset = dataset.drop(['edition-statement'], axis=1)
# dataset = dataset.drop(['for-ages'], axis=1)
# dataset = dataset.drop(['illustrations-note'], axis=1)
# dataset = dataset.drop(['imprint'], axis=1)
dataset = dataset.drop(['rating-avg'], axis=1)
dataset = dataset.drop(['rating-count'], axis=1)
dataset = dataset.drop(['bestsellers-rank'], axis=1)


# Fill NaN values with the mean of the column
dataset['dimension-x'].fillna(dataset['dimension-x'].mean(), inplace=True)
dataset['dimension-x'] = dataset['dimension-x'].replace([np.inf, -np.inf], np.nan)
dataset['dimension-x'].fillna(dataset['dimension-x'].mean(), inplace=True)


dataset['dimension-y'].fillna(dataset['dimension-y'].mean(), inplace=True)
dataset['dimension-y'] = dataset['dimension-y'].replace([np.inf, -np.inf], np.nan)
dataset['dimension-y'].fillna(dataset['dimension-y'].mean(), inplace=True)

dataset['dimension-z'].fillna(dataset['dimension-z'].mean(), inplace=True)
dataset['dimension-z'] = dataset['dimension-z'].replace([np.inf, -np.inf], np.nan)
dataset['dimension-z'].fillna(dataset['dimension-z'].mean(), inplace=True)

dataset['weight'].fillna(dataset['weight'].mean(), inplace=True)
dataset['weight'] = dataset['weight'].replace([np.inf, -np.inf], np.nan)
dataset['weight'].fillna(dataset['weight'].mean(), inplace=True)





In [57]:
print("The number of null values in each column are:") 
print(dataset.isnull().sum())


The number of null values in each column are:
bestsellers-rank    415869
dimension-x              0
dimension-y              0
dimension-z              0
isbn10                   0
isbn13                   0
lang                     0
rating-avg          396083
rating-count        396083
weight               48649
authors_name             0
categories_name          0
format_name           2027
dtype: int64


In [62]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset.drop('lang', axis=1), dataset['lang'], test_size=0.2, random_state=42)

# Define numerical attributes and scaler
num_attrs = ['dimension-x', 'dimension-y', 'dimension-z', 'weight']
scaler = StandardScaler()

# Define categorical attributes and encoder
cat_attrs = ['authors_name', 'categories_name', 'format_name', 'isbn10', 'isbn13']
encoder = OneHotEncoder(handle_unknown='ignore')

# Define column transformer to apply different transformations to different columns
col_transformer = ColumnTransformer([
    ('num', scaler, num_attrs),
    ('cat', encoder, cat_attrs),
])

In [63]:
# Build classification models
log_reg = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()

# Define pipelines to chain together preprocessing and classifier
log_reg_pipeline = Pipeline([
    ('preprocessing', col_transformer),
    ('log_reg', log_reg)
])

dtree_pipeline = Pipeline([
    ('preprocessing', col_transformer),
    ('dtree', dtree)
])

rf_pipeline = Pipeline([
    ('preprocessing', col_transformer),
    ('rf', rf)
])

# Define hyperparameters and their possible values for each classifier
log_reg_params = {'log_reg__C': [0.1, 1, 10], 'log_reg__penalty': ['l1', 'l2']}
dtree_params = {'dtree__max_depth': [5, 10, 15], 'dtree__min_samples_split': [2, 5, 10]}
rf_params = {'rf__n_estimators': [100, 200, 300], 'rf__max_features': ['auto', 'sqrt']}

# Define the number of iterations for each random search
n_iter_log_reg = 5
n_iter_dtree = 5
n_iter_rf = 5


In [64]:
#logistic regression
log_reg_random_search = RandomizedSearchCV(log_reg_pipeline, log_reg_params, n_iter=n_iter_log_reg, cv=5, random_state=42)
log_reg_random_search.fit(X_train, y_train)

25 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [65]:

dtree_rand = RandomizedSearchCV(dtree_pipeline, dtree_params, n_iter=n_iter_dtree, cv=5, scoring='accuracy', refit=True, verbose=1)
dtree_rand.fit(X_train, y_train)


Fitting 5 folds for each of 5 candidates, totalling 25 fits




In [None]:

rf_rand = RandomizedSearchCV(rf_pipeline, rf_params, n_iter=n_iter_rf, cv=5, scoring='accuracy', refit=True, verbose=1)
rf_rand.fit(X_train_transformed, y_train)

In [None]:
#Logistic Regression
y_pred_log_reg = log_reg_grid.predict(X_test)
acc_log_reg = accuracy_score(y_test, y_pred_log_reg)
prec_log_reg = precision_score(y_test, y_pred_log_reg, average='macro')
rec_log_reg = recall_score(y_test, y_pred_log_reg, average='macro')
f1_log_reg = f1_score(y_test, y_pred_log_reg, average='macro')
print('Logistic Regression:')
print('Accuracy:', acc_log_reg)
print('Precision:', prec_log_reg)
print('Recall:', rec_log_reg)
print('F1-score:', f1_log_reg)


In [None]:
#Decision Tree
acc_dtree = accuracy_score(y_test, y_pred_dtree)
y_pred_dtree = dtree_grid.predict(X_test)
prec_dtree = precision_score(y_test, y_pred_dtree, average='macro')
rec_dtree = recall_score(y_test, y_pred_dtree, average='macro')
f1_dtree = f1_score(y_test, y_pred_dtree, average='macro')
print('Decision Tree:')
print('Accuracy:', acc_dtree)
print('Precision:', prec_dtree)
print('Recall:', rec_dtree)
print('F1-score:', f1_dtree)


In [None]:
#Random Forest
acc_rf = accuracy_score(y_test, y_pred_rf)
y_pred_rf = rf_grid.predict(X_test)
prec_rf = precision_score(y_test, y_pred_rf, average='macro')
rec_rf = recall_score(y_test, y_pred_rf, average='macro')
f1_rf = f1_score(y_test, y_pred_rf, average='macro')
print('Random Forest:')
print('Accuracy:', acc_rf)
print('Precision:', prec_rf)
print('Recall:', rec_rf)
print('F1-score:', f1_rf)


In [None]:
#KNN
y_pred_knn = knn_grid.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
prec_knn = precision_score(y_test, y_pred_knn, average='macro')
rec_knn = recall_score(y_test, y_pred_knn, average='macro')
f1_knn = f1_score(y_test, y_pred_knn, average='macro')
print('K-Nearest Neighbors:')
print('Accuracy:', acc_knn)
print('Precision:', prec_knn)
print('Recall:', rec_knn)
