# Modeling

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate

In [3]:
model_path = r"C:\Users\adame\OneDrive\Documents\GitHub\springboard_repository\Capstone_books\Capstone_books\models\book_rating_prediction_model.pkl"
with open(model_path, 'rb') as f:
    model = pickle.load(f)

In [4]:
books_data = pd.read_csv(r"C:\Users\adame\OneDrive\Documents\GitHub\springboard_repository\Capstone_books\Capstone_books\data\processed\cleaned_processed_books.csv")

### Intro

In [6]:
# get a random sample from the books dataframe and remove the sample from the original dataset
sample = books_data.sample(n=1000)
books_data.drop(sample.index, inplace=True)

In [7]:
sample

Unnamed: 0,bookID,title,authors,isbn,isbn13,language_code,num_pages,publication_date,publisher,average_rating_mean,...,language_code_nl,language_code_nor,language_code_por,language_code_rus,language_code_spa,language_code_swe,language_code_tur,language_code_zho,publisher_encoded,genres_encoded
141,324,Cien años de soledad,Gabriel García Márquez,0785950109,9780785950103,spa,448,1990-01-01,French & European,4.07,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.957987,3.970609
3356,12646,On Democracy & Education (Social Theory Educa...,Noam Chomsky/C.P. Otero,0415926327,9780415926324,en-US,480,2002-11-22,Routledge Falmer,3.96,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.940449,3.940449
2064,7681,Five Patients,Michael Crichton,0345354648,9780345354648,eng,204,1989-01-13,Ballantine Books,3.48,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.865660,3.872624
4522,16948,Children of God (The Sparrow #2),Mary Doria Russell,044900483X,9780449004838,eng,451,1999-02-02,Ballantine Books,4.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.865660,3.953460
6256,24777,Count Karlstein,Philip Pullman/Diana Bryan,0375803483,9780375803482,eng,256,2000-02-22,Yearling,3.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.910155,3.880599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5673,22364,JLA: Earth 2,Grant Morrison/Frank Quitely,1563896311,9781563896316,eng,96,2000-10-01,DC Comics,3.97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.833619,3.942131
9605,40996,The Wizard (The Wizard Knight #2),Gene Wolfe,0765350505,9780765350503,eng,587,2006-08-29,Tor Fantasy,3.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.852357,3.927438
5347,20800,Conversations with Don DeLillo,Don DeLillo/Thomas DePietro,1578067049,9781578067046,en-US,183,2005-01-13,University Press of Mississippi,4.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.962338,3.953460
3262,12328,How To Draw The Legend Of Zelda (troll),Michael Teitelbaum/Ron Zalme,0439635810,9780439635813,en-US,32,2004-02-01,Scholastic,4.21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.978080,3.972976


### Refit the model with all data minus the sample

In [9]:
# split the data to refit the model
X = books_data[model.X_columns]
y = books_data['average_rating_mean']
len(X), len(y)

(9533, 9533)

In [10]:
model.fit(X, y)

In [11]:
# cross validate the model and score
cv_results = cross_validate(model, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [12]:
mae_mean, mae_std = np.mean(-1 * cv_results['test_score']), np.std(-1 * cv_results['test_score'])
mae_mean, mae_std

(0.030568706656756296, 0.0042075819206639975)

In [13]:
# r-squared score
model.score(X, y)

0.9832031311473424

## Predict sample book ratings

In [15]:
# title in the sample
titles_list = list(sample.title)

In [16]:
# split the sample data for predictions
X_samp = sample.loc[sample.title == titles_list, model.X_columns]
y_samp = sample.loc[sample.title == titles_list, 'average_rating_mean']

In [17]:
samp_pred = model.predict(X_samp)

### Compare the predicted values against the actual values

In [50]:
compare = pd.DataFrame({'Actual':y_samp, 'Predicted':samp_pred, 'Difference':round(y_samp - samp_pred, 4)})
compare.sort_values(by='Difference', axis=0, ascending=False, inplace=True)
compare

Unnamed: 0,Actual,Predicted,Difference
2238,4.50,3.485740,1.0143
792,4.67,4.150244,0.5198
7720,4.15,3.631667,0.5183
1578,3.90,3.408504,0.4915
3832,4.20,3.813366,0.3866
...,...,...,...
4467,3.00,3.996163,-0.9962
10502,2.33,3.389203,-1.0592
5396,2.00,3.228309,-1.2283
6206,2.67,4.233740,-1.5637


In [98]:
compare['Difference'].mean()

-0.008052499999999999

#### The max difference in prediction was 1.01 and minimum was -2.04. The mean difference was -0.008.