In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

import gensim
from gensim.models import Doc2Vec 
from gensim.models.doc2vec import TaggedDocument

import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn import utils
from sklearn.metrics import accuracy_score, f1_score

import re
import seaborn as sns
import matplotlib.pyplot as plt

import multiprocessing

import warnings;
warnings.filterwarnings('ignore')

  from pandas import Panel


In [2]:
df = pd.read_csv('book_dataframe.csv', index_col=0)
df.head()

Unnamed: 0,title,author,overview,genre
0,Dungeons &amp; Dragons Player's Handbook (Core...,Wizards RPG Team,Create heroic characters for the world’s great...,Activity & Game Books
1,Tasha's Cauldron of Everything (D&amp;D Rules ...,Wizards RPG Team,A magical mixture of rules options for the wor...,Activity & Game Books
2,"Card Night: Classic Games, Classic Decks, and ...",Will Roya,Learn when to hold 'em and when to fold 'em wi...,Activity & Game Books
3,The Queen's Gambit (Television Tie-in),Walter Tevis,"Engaging and fast-paced, this gripping coming-...",Activity & Game Books
4,The Answer Is...: Reflections on My Life,Alex Trebek,A RECOMMENDED SUMMER READ BY THE NEW YORK TIME...,Activity & Game Books


In [3]:
df.shape

(1520, 4)

In [4]:
#make a list of genres
genres = df['genre'].unique().tolist()

In [5]:
#group dataframe by genre
grouped_by_genre =  df.groupby(df.genre)    

In [6]:
#make dictionary with keys = genre and values = df groupedby that genre
genre_dict = dict.fromkeys(genres, 0)
for genre in genres:
    genre_dict[genre] = grouped_by_genre.get_group(genre)

We want to make sure that our train_df takes randomly 80% and not random 80% of the entire original dataframe so in our training set will contain the same amount of examples from each genre.  

In [7]:
training = [dataframes.sample(frac = 0.8) for dataframes in genre_dict.values()]

In [8]:
train_df = pd.concat(training)

In [9]:
test_df = df.drop(train_df.index)

In [10]:
#fixes some indexing issue which hopefully can be resolved more cleverly
train_df = train_df.reset_index()
test_df = test_df.reset_index()

In [11]:
train_df.drop('index', axis = 1, inplace = True)
test_df.drop('index', axis = 1, inplace = True)

We will first use doc2vec for prediction of genre using titles of the books

In [12]:
train_title_df = train_df[['title', 'genre']]
test_title_df = test_df[['title', 'genre']]

In [13]:
test_title_df.head()

Unnamed: 0,title,genre
0,"Card Night: Classic Games, Classic Decks, and ...",Activity & Game Books
1,Candlekeep Mysteries (D&amp;D Adventure Book -...,Activity & Game Books
2,The Anatomy Coloring Book,Activity & Game Books
3,"399 Games, Puzzles &amp; Trivia Challenges Spe...",Activity & Game Books
4,New York Times Sunday Crossword Puzzles Volume 30,Activity & Game Books


In [14]:
#define function to remove punctuations and symbols etc., (maybe theres a better way than this?)
def clean_text(text): 
    text = re.sub(r'&', r' ', text)
    text = re.sub(r'#', r' ', text)
    text = re.sub(r';', r' ', text)
    text = re.sub(r':', r' ', text)
    text = re.sub(r'!', r' ', text)
    text = re.sub(r'\d', r' ', text)
    text = text.lower()
    text = text.replace('.', '')
    text = text.replace('"', '')
    text = text.replace('(', '')
    text = text.replace(')', '')
    return text

#define a function to remove stop-words and tokenize text using NLTK tokenizer.
def tokenize_text(text):
    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    
    tokens_without_sw = [w for w in tokens if not w in stopwords.words('english')]
    return tokens_without_sw


        

In [15]:
train_title_df['title'] = train_title_df['title'].apply(lambda r: clean_text(r))
test_title_df['title'] = test_title_df['title'].apply(lambda r: clean_text(r))

Honestly, one should have done this before splitting to training and testing

In [16]:
train_title = train_title_df.apply(lambda r: TaggedDocument(words=tokenize_text(r['title']), tags=[r.genre]), axis=1)
test_title = test_title_df.apply(lambda r: TaggedDocument(words=tokenize_text(r['title']), tags=[r.genre]), axis=1)

In [17]:
train_title.values

array([TaggedDocument(words=['large', 'print', 'word', 'search', 'puzzles'], tags=['Activity & Game Books']),
       TaggedDocument(words=['apex', 'legends', 'pathfinder', "'s", 'quest', 'lore', 'book'], tags=['Activity & Game Books']),
       TaggedDocument(words=['volo', "'s", 'guide', 'monsters'], tags=['Activity & Game Books']),
       ...,
       TaggedDocument(words=["'ll", 'never', 'french', 'matter', 'living', 'small', 'village', 'brittany'], tags=['Travel']),
       TaggedDocument(words=['humans', 'amp', 'exclusive', 'edition'], tags=['Travel']),
       TaggedDocument(words=['catch', 'kill', 'lies', 'spies', 'conspiracy', 'protect', 'predators'], tags=['Travel'])],
      dtype=object)

In [18]:

cores = multiprocessing.cpu_count()

In [19]:
model = Doc2Vec(dm=0, vector_size=30, window = 5, min_count=1, sample = 1e5, workers=cores)
model.build_vocab([x for x in tqdm(train_title.values)])

100%|██████████| 1216/1216 [00:00<00:00, 1084703.03it/s]


In [20]:
[x for x in tqdm(train_title.values)]

100%|██████████| 1216/1216 [00:00<00:00, 143378.88it/s]


[TaggedDocument(words=['large', 'print', 'word', 'search', 'puzzles'], tags=['Activity & Game Books']),
 TaggedDocument(words=['apex', 'legends', 'pathfinder', "'s", 'quest', 'lore', 'book'], tags=['Activity & Game Books']),
 TaggedDocument(words=['volo', "'s", 'guide', 'monsters'], tags=['Activity & Game Books']),
 TaggedDocument(words=['queen', "'s", 'gambit', 'television', 'tie-in'], tags=['Activity & Game Books']),
 TaggedDocument(words=['bobby', 'fischer', 'teaches', 'chess'], tags=['Activity & Game Books']),
 TaggedDocument(words=['beautiful', 'nature', 'coloring', 'book'], tags=['Activity & Game Books']),
 TaggedDocument(words=['new', 'york', 'times', 'best', 'week', 'series', 'monday', 'crosswords', 'easy', 'puzzles'], tags=['Activity & Game Books']),
 TaggedDocument(words=['tasha', "'s", 'cauldron', 'everything', 'amp', 'rules', 'expansion', 'dungeons', 'amp', 'dragons'], tags=['Activity & Game Books']),
 TaggedDocument(words=['new', 'york', 'times', 'monday', 'friday', 'easy'

In [21]:
%%time
for epoch in range(30):
    model.train([x for x in tqdm(train_title.values)], total_examples=len(train_title.values), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

100%|██████████| 1216/1216 [00:00<00:00, 277068.32it/s]
100%|██████████| 1216/1216 [00:00<00:00, 768229.20it/s]
100%|██████████| 1216/1216 [00:00<00:00, 209766.95it/s]
100%|██████████| 1216/1216 [00:00<00:00, 515908.73it/s]
100%|██████████| 1216/1216 [00:00<00:00, 437603.92it/s]
100%|██████████| 1216/1216 [00:00<00:00, 777362.24it/s]
100%|██████████| 1216/1216 [00:00<00:00, 781411.62it/s]
100%|██████████| 1216/1216 [00:00<00:00, 726431.23it/s]
100%|██████████| 1216/1216 [00:00<00:00, 912229.24it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1497437.95it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1449765.11it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1247010.68it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1117745.71it/s]
100%|██████████| 1216/1216 [00:00<00:00, 880724.17it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1188597.92it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1293500.80it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1084703.03it/s]
100%|██████████| 1216/1216 [00:00<00:00, 

CPU times: user 1.49 s, sys: 89.2 ms, total: 1.58 s
Wall time: 1.82 s


In [22]:
#function which better 'converges' vectors for tagged words for each tag
def learning_vec(model, tagged_docs):
    sents = tagged_docs.values 
    genre, arr_of_words = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=100)) for doc in sents])
    return genre, arr_of_words

In [23]:
y_train, X_train = learning_vec(model, train_title)
y_test, X_test = learning_vec(model, test_title)
logreg = LogisticRegression(solver='liblinear', C=1e5)

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [24]:
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

Testing accuracy 0.05263157894736842


Trying Summary 

In [25]:
train_summ_df = train_df[['overview', 'genre']]
test_summ_df = test_df[['overview', 'genre']]

In [26]:
train_summ_df['overview'] = train_summ_df['overview'].apply(lambda r: clean_text(r))
test_summ_df['overview'] = test_summ_df['overview'].apply(lambda r: clean_text(r))

In [27]:
train_summ = train_summ_df.apply(lambda r: TaggedDocument(words=tokenize_text(r['overview']), tags=[r.genre]), axis=1)
test_summ = test_summ_df.apply(lambda r: TaggedDocument(words=tokenize_text(r['overview']), tags=[r.genre]), axis=1)

In [28]:
model = Doc2Vec(dm=0, vector_size=30, negative=5, window = 5, hs=0, min_count=1, sample = 1e5, workers=cores)
model.build_vocab([x for x in tqdm(train_summ.values)])

100%|██████████| 1216/1216 [00:00<00:00, 59409.13it/s]


In [29]:
%%time
for epoch in range(30):
    model.train(utils.shuffle([x for x in tqdm(train_summ.values)]), total_examples=len(train_summ.values), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

100%|██████████| 1216/1216 [00:00<00:00, 36856.48it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1643129.40it/s]
100%|██████████| 1216/1216 [00:00<00:00, 2163883.61it/s]
100%|██████████| 1216/1216 [00:00<00:00, 718247.24it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1767246.59it/s]
100%|██████████| 1216/1216 [00:00<00:00, 867393.48it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1518842.66it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1493929.02it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1724813.55it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1367732.28it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1654321.66it/s]
100%|██████████| 1216/1216 [00:00<00:00, 2026330.42it/s]
100%|██████████| 1216/1216 [00:00<00:00, 940489.34it/s]
100%|██████████| 1216/1216 [00:00<00:00, 1683814.35it/s]
100%|██████████| 1216/1216 [00:00<00:00, 708272.97it/s]
100%|██████████| 1216/1216 [00:00<00:00, 734274.93it/s]
100%|██████████| 1216/1216 [00:00<00:00, 809567.25it/s]
100%|██████████| 1216/1216 [00:00<00:00

CPU times: user 11.7 s, sys: 487 ms, total: 12.2 s
Wall time: 6.69 s


In [30]:
y_train, X_train = learning_vec(model, train_summ)
y_test, X_test = learning_vec(model, test_summ)
logreg = LogisticRegression(solver='liblinear', C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [31]:
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))

Testing accuracy 0.12828947368421054


It might be the case that we don't have enough data to categorize. 
<br>Lets try with more data and with fewer categories and see if our model works better. 

In [32]:
small_df = pd.read_csv('new_releases_Horror&Romance.csv')