## Predicting Book Success - Decision Tree Modeling

1. Import Packages

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ShuffleSplit
from sklearn.metrics import classification_report


2. Read in the exported cleaned dataset, this can also be accessed from Github repository

In [3]:
merged=pd.read_csv("merged2_50K.csv")


In [5]:
merged.columns

Index(['Unnamed: 0', 'isbn', 'books_reviews_count', 'series', 'country_code',
       'language_code', 'asin', 'is_ebook', 'books_average_rating',
       'kindle_asin', 'similar_books', 'description', 'format', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year',
       'book_id', 'books_ratings_count', 'work_id', 'title',
       'title_without_series', 'author_id', 'authors_average_rating',
       'authors_text_reviews_count', 'name', 'authors_ratings_count', 'genres',
       'fiction_or_nonFiction', 'series or not', 'log_weighted_rating',
       'total_books', 'pgs_per_book'],
      dtype='object')

3. Choose feature and target variables: title and description will be vectorized.

In [6]:
# feature variables we want to use in the model:


merged_to_model=merged[['title','description',"books_ratings_count", "books_reviews_count"
                            ,'num_pages', 'log_weighted_rating',
                           "fiction_or_nonFiction",'series or not',"author_id", "total_books"]]

In [7]:
merged_to_model.columns

Index(['title', 'description', 'books_ratings_count', 'books_reviews_count',
       'num_pages', 'log_weighted_rating', 'fiction_or_nonFiction',
       'series or not', 'author_id', 'total_books'],
      dtype='object')

4. Get dummy variables, define categorical columns

In [8]:
#categorical data
categorical_cols = [ "author_id",'series or not','fiction_or_nonFiction'] 

#numerical data- get dummy variables for categorical columns
numerics_data = pd.get_dummies(merged_to_model, columns = categorical_cols)

In [9]:
numerics_data.dropna(inplace= True)
numerics_data.shape

(30747, 24949)

In [10]:
numerics_data[:5]

Unnamed: 0,title,description,books_ratings_count,books_reviews_count,num_pages,log_weighted_rating,total_books,author_id_4,author_id_7,author_id_10,...,author_id_17250134,author_id_17255261,author_id_17264133,author_id_17265337,author_id_17290483,author_id_17300081,series or not_0,series or not_1,fiction_or_nonFiction_0,fiction_or_nonFiction_1
2,The Red Tent,IN THE RED TENT--NOW A MAJOR LIFETIME MINISERI...,134,19,352.0,6.323355,5,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,The Red Tent,The red tent is the place where women gathered...,460,60,336.0,7.556742,5,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,"Saying Kaddish: How to Comfort the Dying, Bury...","Anita Diamant's knowledge, sensitivity, and cl...",107,14,288.0,6.098344,5,0,0,0,...,0,0,0,0,0,0,1,0,0,1
6,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,140,7,600.0,6.335409,5,0,0,0,...,0,0,0,0,0,0,0,1,1,0
7,Bride of the Rat God,Chrysanda Flamande was the sultriest vamp of t...,689,65,336.0,7.85966,5,0,0,0,...,0,0,0,0,0,0,1,0,1,0


5. Set the features variable (X), and the target variable/predictor (y)

In [11]:
# X is features variables, y is the target variable (predictor)

X = numerics_data.drop(['log_weighted_rating'], axis = 1).reset_index(drop = True)
y = numerics_data['log_weighted_rating'].reset_index(drop = True)

In [12]:
X[:5]

Unnamed: 0,title,description,books_ratings_count,books_reviews_count,num_pages,total_books,author_id_4,author_id_7,author_id_10,author_id_14,...,author_id_17250134,author_id_17255261,author_id_17264133,author_id_17265337,author_id_17290483,author_id_17300081,series or not_0,series or not_1,fiction_or_nonFiction_0,fiction_or_nonFiction_1
0,The Red Tent,IN THE RED TENT--NOW A MAJOR LIFETIME MINISERI...,134,19,352.0,5,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,The Red Tent,The red tent is the place where women gathered...,460,60,336.0,5,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,"Saying Kaddish: How to Comfort the Dying, Bury...","Anita Diamant's knowledge, sensitivity, and cl...",107,14,288.0,5,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",Omnibus book club edition containing the Ladie...,140,7,600.0,5,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,Bride of the Rat God,Chrysanda Flamande was the sultriest vamp of t...,689,65,336.0,5,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [16]:
y.shape

(30747,)

6. Clean out the NaN's, infinity, and neg infinity values in log weighted rating. 

In [17]:
#go through each value in log_weighted_rating and if the value is nan, negative infinity,
#or infinity remove and dont keep as a indice

def clean_array(arr):
    indices_to_keep = ~arr.isin([np.nan, np.inf, -np.inf]) 
    return (arr[indices_to_keep], indices_to_keep)

In [18]:


y, indices_to_keep = clean_array(y)

In [19]:
y.shape

(30619,)

In [20]:
#match the x indices to the y indices

X = X.iloc[indices_to_keep.index[indices_to_keep==True],:]

In [22]:
X.shape

(30619, 24948)

7. Divide scores into classes: 1 - bad, 2- average,  3- above average, 4 - good,  5- excellent

In [23]:

classes = pd.cut(y, bins=5, labels=[1,2,3,4,5])

y = np.array(classes)

In [24]:
y

array([3, 3, 3, ..., 2, 1, 2])

In [25]:
X=np.array(X)

In [None]:
X

8. Now that X and y are prepped, split into training and test datasets. 

In [17]:
#https://stackoverflow.com/questions/28064634/random-state-pseudo-random-number-in-scikit-learn

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [18]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Mohammedkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Mohammedkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

9. Set up functions to vectorize title and description in pipeline, get numeric data

In [19]:
#functions to get numeric data and vectorize title and description variables



def get_title(x):
    
    return [record[0] for record in x]
def get_description(x):
  
    return [record[1] for record in x]
def get_numeric_data(x):
   
    return [record[3:].astype(float) for record in x]
    

transformer_numeric_data = FunctionTransformer(get_numeric_data, validate=False)
transformer_title = FunctionTransformer(get_title, validate=False)
transformer_description = FunctionTransformer(get_description, validate=False)

vectorizer = CountVectorizer(max_features=2000, tokenizer= word_tokenize)

In [20]:
#Model structure


pipeline = Pipeline([
    ("features", FeatureUnion([
        ("description", Pipeline([
            ('selector', transformer_description),
            ('vec',vectorizer)
        ])),
        ("title", Pipeline([
            ('selector', transformer_title),
            ('vec',vectorizer)
        ])),
        ("numeric_features", Pipeline([
            ('selector', transformer_numeric_data)
        ]))
    ])),
    ('scale', StandardScaler(with_mean = False)),
    
    ("dt", DecisionTreeClassifier())
])





In [45]:
bb =pipeline.fit(X_train,y_train)

10. Pipeline score and accuracy result in total number of predictions correctly predicted

In [49]:

cc =pipeline.score(X_test,y_test)
cc

0.6568256041802744

In [48]:
np.save("cc.py", cc)