## Predicting Book Success: SVM model

In [36]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, ShuffleSplit
from sklearn.svm import SVC

In [37]:
merged=pd.read_csv("merged2_50K.csv")

In [38]:
merged.columns

Index(['Unnamed: 0', 'isbn', 'books_reviews_count', 'series', 'country_code',
       'language_code', 'asin', 'is_ebook', 'books_average_rating',
       'kindle_asin', 'similar_books', 'description', 'format', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year',
       'book_id', 'books_ratings_count', 'work_id', 'title',
       'title_without_series', 'author_id', 'authors_average_rating',
       'authors_text_reviews_count', 'name', 'authors_ratings_count', 'genres',
       'fiction_or_nonFiction', 'series or not', 'log_weighted_rating',
       'total_books', 'pgs_per_book'],
      dtype='object')

In [40]:
merged_to_model=merged[['title','description',"books_ratings_count", "books_reviews_count"
                            ,'num_pages', 'log_weighted_rating',
                           "fiction_or_nonFiction",'series or not', "total_books"]]

In [41]:
#categorical data
categorical_cols = ['series or not','fiction_or_nonFiction'] 

#numerical data- get dummy variables for categorical columns
numerics_data = pd.get_dummies(merged_to_model, columns = categorical_cols)

In [42]:
numerics_data.dropna(inplace= True)
numerics_data.shape

(30747, 11)

In [43]:

X = numerics_data.drop(['log_weighted_rating'], axis = 1).reset_index(drop = True)
y = numerics_data['log_weighted_rating'].reset_index(drop = True)

In [44]:
def clean_array(arr):
    indices_to_keep = ~arr.isin([np.nan, np.inf, -np.inf]) 
    return (arr[indices_to_keep], indices_to_keep)

In [45]:
#go through each value in log_weighted_rating and if the value is nan, negative infinity,
#or infinity remove and dont keep as a indice

y, indices_to_keep = clean_array(y)

In [46]:
X = X.iloc[indices_to_keep.index[indices_to_keep==True],:]

In [48]:
classes = pd.cut(y, bins=3, labels=[1,2,3])

y = np.array(classes)

In [49]:
X=np.array(X)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [51]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Mohammedkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/Mohammedkhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
def get_title(x):
    
    return [record[0] for record in x]
def get_description(x):
  
    return [record[1] for record in x]
def get_numeric_data(x):
   
    return [record[3:].astype(float) for record in x]
    

transformer_numeric_data = FunctionTransformer(get_numeric_data, validate=False)
transformer_title = FunctionTransformer(get_title, validate=False)
transformer_description = FunctionTransformer(get_description, validate=False)

vectorizer = CountVectorizer(max_features=2000, tokenizer= word_tokenize)

In [53]:
pipeline = Pipeline([
    ("features", FeatureUnion([
        ("description", Pipeline([
            ('selector', transformer_description),
            ('vec',vectorizer)
        ])),
        ("title", Pipeline([
            ('selector', transformer_title),
            ('vec',vectorizer)
        ])),
        ("numeric_features", Pipeline([
            ('selector', transformer_numeric_data)
        ]))
    ])),
    ('scale', StandardScaler(with_mean = False)),
 
    ("svm", SVC(kernel = 'poly', degree = 3))
])

In [54]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('description', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function get_description at 0x1a23583d90>,
          inv_kw_args=None, inverse_func=None,...y', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [59]:
aa = pipeline.score(X_test,y_test)

In [60]:
aa

0.7058131939908556

In [61]:
np.save("aa.py",aa)