In [23]:
import pandas as pd
import numpy as np

In [24]:
df = pd.read_csv('data/books-dataset.zip', encoding='latin-1', index_col=0)
df.head()

Unnamed: 0,Title,Author,Genre
0,The Walking Dead Compendium Volume 3 (Walking ...,Robert Kirkman,Comics & Graphic Novels
1,Saga Volume 5 (Saga Tp),Brian K. Vaughan,Comics & Graphic Novels
2,The Walking Dead: Compendium One,Robert Kirkman,Comics & Graphic Novels
3,"Batman: The Killing Joke, Deluxe Edition",Alan Moore,Comics & Graphic Novels
4,The Walking Dead: Compendium Two,Robert Kirkman,Comics & Graphic Novels


In [31]:
books = df[['Title', 'Author']]
genres = df[['Genre']]

In [48]:
books = books.fillna('No Book')

# Tag (Genre) Treatment

In [9]:
from sklearn.preprocessing import LabelEncoder

In [49]:
genre_encoder = LabelEncoder()
genre_encoder.fit(np.ravel(genres))
encoded_genres = genre_encoder.transform(np.ravel(genres))

In [50]:
encoded_genres

array([ 3,  3,  3, ..., 14, 14, 14], dtype=int64)

In [56]:
genre_encoder.inverse_transform(encoded_genres[0])

'Comics & Graphic Novels'

In [93]:
y = encoded_genres

# Feature Treatment

In [335]:
features = pd.DataFrame()

In [97]:
author_encoder = LabelEncoder()
author_encoder.fit(np.ravel(books.Author))
encoded_authors = author_encoder.transform(np.ravel(books.Author))

In [336]:
#features['encoded_author'] = encoded_authors

In [99]:
author_encoder.inverse_transform(encoded_authors[0])

'Robert Kirkman'

In [340]:
features['book_info'] = books.Author + ' ' + books.Title

In [341]:
from nltk.corpus import stopwords
english_stopwords = list(stopwords.words('english'))

In [101]:
def format(title, stopwords=english_stopwords):
    split_title = title.split()
    return ' '.join([(x) for x in split_title if x not in stopwords])

In [342]:
features['book_info'] = features.book_info.apply(format)

In [343]:
features.head()

Unnamed: 0,book_info
0,Robert Kirkman The Walking Dead Compendium Vol...
1,Brian K. Vaughan Saga Volume 5 (Saga Tp)
2,Robert Kirkman The Walking Dead: Compendium One
3,"Alan Moore Batman: The Killing Joke, Deluxe Ed..."
4,Robert Kirkman The Walking Dead: Compendium Two


In [344]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [346]:
vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'\w+', ngram_range=(1,1), min_df=1,
                             strip_accents='unicode', max_features=25000, binary=True, sublinear_tf=True)
vectors = vectorizer.fit_transform(features.book_info)

In [347]:
vectors

<101061x25000 sparse matrix of type '<class 'numpy.float64'>'
	with 755031 stored elements in Compressed Sparse Row format>

# Model Selection

In [348]:
from sklearn import metrics

def test_model(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    print("Results for train set:")
    pred_train = clf.predict(X_train)
    print(metrics.classification_report(y_train, pred_train))
    print("Results for test set:")
    pred_test = clf.predict(X_test)
    print(metrics.classification_report(y_test, pred_test))

## Train Test Split

In [166]:
from sklearn.cross_validation import train_test_split

In [349]:
X_train, X_test, y_train, y_test = train_test_split(vectors, y, test_size=0.25)

## Naive Bayes

### Binomial

In [350]:
clf = BernoulliNB(alpha=0.5)
test_model(clf, X_train, X_test, y_train, y_test)

Results for train set:
             precision    recall  f1-score   support

          0       0.70      0.61      0.65      3198
          1       0.86      0.90      0.88      7457
          2       0.70      0.87      0.78     10252
          3       0.88      0.68      0.77      2306
          4       0.94      0.25      0.39      1006
          5       0.74      0.79      0.76      5084
          6       0.79      0.74      0.77      5140
          7       0.63      0.77      0.69      5734
          8       0.82      0.49      0.62      2549
          9       0.78      0.83      0.81      5739
         10       0.78      0.77      0.78      3264
         11       0.87      0.69      0.77      2821
         12       0.84      0.61      0.70      2024
         13       0.74      0.66      0.70      5562
         14       0.90      0.90      0.90     13659

avg / total       0.79      0.78      0.78     75795

Results for test set:
             precision    recall  f1-score   suppor

In [351]:
genre_encoder.inverse_transform(clf.predict(vectorizer.transform(['King Arthur'])))[0]

"Children's Books"

### Multinomial Naive Bayes

In [295]:
clf = MultinomialNB(alpha=0.5)
test_model(clf, X_train, X_test, y_train, y_test)

Results for train set:
             precision    recall  f1-score   support

          0       0.80      0.40      0.53      3205
          1       0.75      0.92      0.83      7503
          2       0.61      0.88      0.72     10158
          3       0.92      0.53      0.67      2285
          4       0.97      0.16      0.28      1010
          5       0.71      0.73      0.72      5092
          6       0.79      0.67      0.73      5116
          7       0.68      0.66      0.67      5707
          8       0.88      0.31      0.46      2590
          9       0.72      0.83      0.77      5733
         10       0.84      0.61      0.70      3236
         11       0.91      0.57      0.70      2792
         12       0.90      0.39      0.54      2008
         13       0.72      0.57      0.63      5607
         14       0.77      0.95      0.85     13753

avg / total       0.75      0.73      0.72     75795

Results for test set:
             precision    recall  f1-score   suppor

In [359]:
genre_encoder.inverse_transform(clf.predict(vectorizer.transform(['The Part and  the Whole Ernest Heisenberg'])))[0]

'Literature & Fiction'