## Imports

In [1]:
import numpy as np
import pandas as pd
import pymongo
from pprint import pprint
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply the seaborn look to all matplotlib figures in this notebook.
# NOTE(review): newer matplotlib releases renamed this style to 'seaborn-v0_8' --
# confirm against the installed version before re-running.
plt.style.use('seaborn')
import pickle

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from gensim import corpora, models, similarities, matutils

## Load MongoDB

In [2]:
# Connect with pymongo's default connection settings and select the
# database that holds both bestseller collections.
client = pymongo.MongoClient()
db = client['bestsellers_lists']

**NYTimes Weekly Bestsellers**

In [3]:
# Handles to the two collections used throughout the notebook.
weekly_publications = db.weekly_publications
descriptions = db.descriptions

# Collection.count() is deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) with an empty filter counts every stored document
# (duplicates included, hence "non-distinct").
print('non-distinct counts of MongoDB collections')
print('NYTimes books:',(weekly_publications.count_documents({})))
print('Google books:',(descriptions.count_documents({})))

non-distinct counts of MongoDB collections
NYTimes books: 121285
Google books: 7203


In [63]:
# NYTimes book: display one raw document from weekly_publications to see
# which fields are available before projecting them into pandas
weekly_publications.find_one()

{'_id': ObjectId('5a95fab24f61b66319f1af30'),
 'amazon_product_url': 'http://www.amazon.com/The-Fault-Stars-John-Green-ebook/dp/B005ZOBNOI?tag=NYTBS-20',
 'asterisk': 0,
 'bestsellers_date': '2015-01-17',
 'book_details': [{'age_group': '',
   'author': 'John Green',
   'contributor': 'by John Green',
   'contributor_note': '',
   'description': 'A 16-year-old heroine faces the medical realities of cancer.',
   'price': 0,
   'primary_isbn10': 'None',
   'primary_isbn13': '9781101569184',
   'publisher': 'Penguin Group',
   'title': 'THE FAULT IN OUR STARS'}],
 'dagger': 0,
 'display_name': 'Young Adult',
 'isbns': [{'isbn10': '0525478817', 'isbn13': '9780525478812'},
  {'isbn10': '0141345659', 'isbn13': '9780141345659'},
  {'isbn10': '0147513731', 'isbn13': '9780147513731'},
  {'isbn10': '014242417X', 'isbn13': '9780142424179'},
  {'isbn10': '1594137900', 'isbn13': '9781594137907'},
  {'isbn10': '0525426000', 'isbn13': '9780525426004'}],
 'list_name': 'Young Adult',
 'published_date':

In [64]:
# Google Books: display one raw document from the descriptions collection
# to see which fields are available
descriptions.find_one()

{'_id': ObjectId('5a97a1d84f61b6966baa0886'),
 'allowAnonLogging': False,
 'authors': ['Joseph J. Ellis'],
 'averageRating': 4.0,
 'canonicalVolumeLink': 'https://books.google.com/books/about/First_Family.html?hl=&id=SPBPyujwygIC',
 'categories': ['Biography & Autobiography'],
 'contentVersion': '1.1.0.0.preview.0',
 'description': 'Presents a narrative profile of the second president and his wife that traces their more than fifty-year partnership in such areas as civic and foreign affairs.',
 'imageLinks': {'smallThumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=5&source=gbs_api',
  'thumbnail': 'http://books.google.com/books/content?id=SPBPyujwygIC&printsec=frontcover&img=1&zoom=1&source=gbs_api'},
 'industryIdentifiers': [{'identifier': '9780307269621', 'type': 'ISBN_13'},
  {'identifier': '0307269620', 'type': 'ISBN_10'}],
 'infoLink': 'http://books.google.com/books?id=SPBPyujwygIC&dq=isbn:9780307269621&hl=&source=gbs_api',
 'language

## Load into Pandas and merge Mongo collections

### NYTimes collection to Pandas

In [4]:
# NYTimes books: flatten every weekly list entry into one plain dict.
# Only the fields named in the projection are fetched from MongoDB.
nyt_projection = {
    'book_details.primary_isbn13': 1, 'bestsellers_date': 1, 'book_details.title': 1,
    'book_details.author': 1, 'book_details.description': 1, 'amazon_product_url': 1,
    'isbns': 1, 'list_name': 1, 'published_date': 1, 'rank': 1,
}

nyt_books = []
for doc in weekly_publications.find({}, nyt_projection):
    # book_details is a list; only its first element is read
    details = doc['book_details'][0]
    nyt_books.append({
        'primary_isbn13': details['primary_isbn13'],
        'description': details['description'],
        'author': details['author'],
        'title': details['title'],
        'amazon_product_url': doc['amazon_product_url'],
        'bestsellers_date': doc['bestsellers_date'],
        'published_date': doc['published_date'],
        'list_name': doc['list_name'],
        'other_isbns': doc['isbns'],
        'rank': doc['rank'],
        'mongo_id': doc['_id'],
    })

In [5]:
# One row per dict in nyt_books; columns come from the dict keys
df_nyt = pd.DataFrame(nyt_books)
df_nyt.head()

Unnamed: 0,amazon_product_url,author,bestsellers_date,description,list_name,mongo_id,other_isbns,primary_isbn13,published_date,rank,title
0,http://www.amazon.com/The-Fault-Stars-John-Gre...,John Green,2015-01-17,A 16-year-old heroine faces the medical realit...,Young Adult,5a95fab24f61b66319f1af30,"[{'isbn13': '9780525478812', 'isbn10': '052547...",9781101569184,2015-02-01,1,THE FAULT IN OUR STARS
1,http://www.amazon.com/If-I-Stay-Gayle-Forman/d...,Gayle Forman,2015-01-17,A young cellist falls into a coma after she su...,Young Adult,5a95fab24f61b66319f1af31,"[{'isbn13': '9780142415436', 'isbn10': '014241...",9781101046340,2015-02-01,2,IF I STAY
2,http://www.amazon.com/Looking-Alaska-John-Gree...,John Green,2015-01-17,A boy finds excitement when he meets a girl na...,Young Adult,5a95fab24f61b66319f1af32,"[{'isbn13': '9780142402511', 'isbn10': '014240...",9780142402511,2015-02-01,3,LOOKING FOR ALASKA
3,http://www.amazon.com/Paper-Towns-John-Green/d...,John Green,2015-01-17,"After a night of mischief, the girl Quentin lo...",Young Adult,5a95fab24f61b66319f1af33,"[{'isbn13': '9780142414934', 'isbn10': '014241...",9780142414934,2015-02-01,4,PAPER TOWNS
4,http://www.amazon.com/Where-She-Went-Gayle-For...,Gayle Forman,2015-01-17,A rock star and a cellist reunite for an eveni...,Young Adult,5a95fab24f61b66319f1af34,"[{'isbn13': '9780525422945', 'isbn10': '052542...",9781101476321,2015-02-01,5,WHERE SHE WENT


In [6]:
# (rows, columns) of the raw NYT frame
df_nyt.shape

(121285, 11)

In [7]:
# Column dtypes of the raw NYT frame
df_nyt.dtypes

amazon_product_url    object
author                object
bestsellers_date      object
description           object
list_name             object
mongo_id              object
other_isbns           object
primary_isbn13        object
published_date        object
rank                   int64
title                 object
dtype: object

In [8]:
# Collapse the weekly entries to one row per (title, author).
# Sorting by rank within each title/author pair puts the lowest rank number
# first, and drop_duplicates keeps that first row -- hence the rename to best_rank.
df_nyt = (
    df_nyt
    .sort_values(by=['title', 'author', 'rank'], na_position='first')
    .drop_duplicates(subset=['title', 'author'])
    .reset_index(drop=True)
    .rename(columns={'rank': 'best_rank'})
)
df_nyt.shape

(11943, 11)

In [9]:
# Preview the frame after sorting and de-duplicating on title/author
df_nyt.head()

Unnamed: 0,amazon_product_url,author,bestsellers_date,description,list_name,mongo_id,other_isbns,primary_isbn13,published_date,best_rank,title
0,http://www.amazon.com/Give-You-My-Body-Scenes-...,Diana Gabaldon,2016-08-20,The author of the Outlander novels gives tips ...,Advice How-To and Miscellaneous,5a98e57d4f61b6bad451b147,"[{'isbn13': '9780399178573', 'isbn10': '039917...",9780399178573,2016-09-04,8,"""I GIVE YOU MY BODY ..."""
1,http://www.amazon.com/Most-Blessed-Patriarchs-...,Annette Gordon-Reed and Peter S Onuf,2016-04-16,A character study that attempts to make sense ...,Hardcover Nonfiction,5a9755514f61b68dd7648074,"[{'isbn13': '9780871404428', 'isbn10': '087140...",9780871404428,2016-05-01,16,"""MOST BLESSED OF THE PATRIARCHS"""
2,http://www.amazon.com/AskGaryVee-Entrepreneurs...,Gary Vaynerchuk,2016-03-12,The entrepreneur expands on subjects addressed...,Advice How-To and Miscellaneous,5a98e5654f61b6bad451afec,"[{'isbn13': '9780062273123', 'isbn10': '006227...",9780062273123,2016-03-27,6,#ASKGARYVEE
3,http://www.amazon.com/GIRLBOSS-Sophia-Amoruso/...,Sophia Amoruso,2014-05-10,An online fashion retailer traces her path to ...,Advice How-To and Miscellaneous,5a98e5014f61b6bad451a8fe,"[{'isbn13': '9780399169274', 'isbn10': '039916...",9780399169274,2014-05-25,2,#GIRLBOSS
4,http://www.amazon.com/The-100-Startup-Reinvent...,Chris Guillebeau,2012-05-26,How to turn ideas into income.,Hardcover Advice,5a98dfed4f61b6bad451824c,"[{'isbn13': '9780307951526', 'isbn10': '030795...",9780307951526,2012-06-10,6,$100 STARTUP


In [10]:
# Null count per column, largest first
df_nyt.isnull().sum().sort_values(ascending=False)

amazon_product_url    101
description            10
title                   0
best_rank               0
published_date          0
primary_isbn13          0
other_isbns             0
mongo_id                0
list_name               0
bestsellers_date        0
author                  0
dtype: int64

In [11]:
# Tally NYT descriptions by state: missing, empty string, or real text.
# NOTE(review): the last label says ">3 character" but the filter is len > 2.
nyt_desc = df_nyt['description']
print('NYT records with None (na) description:', nyt_desc.isnull().sum())
print('NYT records with blank ("") description:', (nyt_desc == '').sum())
print('NYT records with >3 character description:', (nyt_desc.str.len() > 2).sum())

NYT records with None (na) description: 10
NYT records with blank ("") description: 1280
NYT records with >3 character description: 10653


- These three groups of description values (10 + 1,280 + 10,653) sum to 11,943, the total number of de-duplicated NYT records.

In [12]:
# Normalise missing NYT descriptions so that both None and '' end up as NaN.
# Assign the result back instead of calling fillna(inplace=True) on an
# attribute-accessed column, which is chained assignment and may not write
# through to df_nyt.
df_nyt['description'] = df_nyt['description'].fillna(value=np.nan) # replace None desc with NaN
blank_desc = df_nyt[df_nyt['description']==''].index
# Select by index label and column *name*: .iloc with a hard-coded column
# position silently targets a different column whenever the column order
# changes (e.g. when columns follow dict insertion order instead of being sorted).
df_nyt.loc[blank_desc, 'description'] = np.nan # replace blank desc with NaN

In [13]:
# Re-tally NYT descriptions after blanks were converted to NaN.
# NOTE(review): the last label says ">3 character" but the filter is len > 2.
nyt_desc = df_nyt['description']
print('NYT records with None (na) description:', nyt_desc.isnull().sum())
print('NYT records with blank ("") description:', (nyt_desc == '').sum())
print('NYT records with >3 character description:', (nyt_desc.str.len() > 2).sum())

NYT records with None (na) description: 1290
NYT records with blank ("") description: 0
NYT records with >3 character description: 10653


In [14]:
# Distinct values among the null descriptions, selected in a single .loc step
df_nyt.loc[df_nyt['description'].isnull(), 'description'].unique()

array([nan], dtype=object)

### Google Books collection to Pandas

In [15]:
# Google Books: keep only documents whose authors, title, nyt_isbn13 and
# description are each neither None nor '', and fetch just those fields.
gb_filter = {'$and': [{'authors': {'$nin': [None,'']}}, {'title': {'$nin': [None,'']}},
                      {'nyt_isbn13': {'$nin': [None,'']}}, {'description': {'$nin': [None,'']}}]}
gb_projection = {'title': 1, 'nyt_isbn13': 1, 'description': 1, 'authors': 1}

google_books = []
for doc in descriptions.find(gb_filter, gb_projection):
    google_books.append({
        # str.join replaces the manual counter-and-concatenate loop and
        # produces the same comma-separated author string
        'authors': ', '.join(doc['authors']),
        'title': doc['title'],
        'nyt_isbn13': doc['nyt_isbn13'],
        'description': doc['description'],
        'mongo_id': doc['_id'],
    })

In [16]:
# One row per dict in google_books; columns come from the dict keys
df_gb = pd.DataFrame(google_books)
df_gb.head()

Unnamed: 0,authors,description,mongo_id,nyt_isbn13,title
0,Joseph J. Ellis,Presents a narrative profile of the second pre...,5a97a1d84f61b6966baa0886,9780307269621,First Family
1,Edward Conard,Presents a counterintuitive assessment of the ...,5a97a1d94f61b6966baa0887,9781591845508,Unintended Consequences
2,Bruce Schneier,You are under surveillance right now. Your cel...,5a97a1da4f61b6966baa0888,9780393244816,Data and Goliath
3,Malcolm Gladwell,"Identifies the qualities of successful people,...",5a97a1db4f61b6966baa0889,9780316017923,Outliers
4,Anthony Shadid,A journalist traces the story of his family's ...,5a97a1dd4f61b6966baa088a,9780547134666,House of Stone


In [17]:
# (rows, columns) of the Google Books frame
df_gb.shape

(7074, 5)

In [18]:
# Column dtypes of the Google Books frame
df_gb.dtypes

authors        object
description    object
mongo_id       object
nyt_isbn13     object
title          object
dtype: object

In [19]:
# Tally Google descriptions by state: missing, empty string, or real text.
# NOTE(review): the last label is misspelled ("Gogle") and says ">3 character"
# while the filter is len > 1.
gb_desc = df_gb['description']
print('Google records with None (na) description:', gb_desc.isnull().sum())
print('Google records with blank ("") description:', (gb_desc == '').sum())
print('Gogle records with >3 character description:', (gb_desc.str.len() > 1).sum())

Google records with None (na) description: 0
Google records with blank ("") description: 0
Gogle records with >3 character description: 7074


In [20]:
# List Google descriptions shorter than 37 characters to eyeball
# placeholder text rather than real blurbs
df_gb[df_gb['description'].str.len()<37]

Unnamed: 0,authors,description,mongo_id,nyt_isbn13,title
368,Bill O'Reilly,No Marketing Blurb,5a97a3ac4f61b6966baa0a03,9780385346627,Keep it Pithy
1570,Elizabeth Gilbert,GILBERT/EAT PRAY LOVE,5a98f4174f61b6bae1aa7a97,9780143038412,"Eat, Pray, Love"
1948,David Allen,ALLEN/GETTING THINGS DONE,5a9a3cfa4f61b6d59c366edc,9780142000281,Getting Things Done
1981,Caldwell B. Esselstyn,ESSELSTYN/PREVENT AND REVERSE,5a9a3d194f61b6d59c366efd,9781583333006,Prevent and Reverse Heart Disease
2057,Trisha Yearwood,No Marketing Blurb,5a9a3d6a4f61b6d59c366f49,9780307465238,Home Cooking with Trisha Yearwood
2336,Eckhart Tolle,TOLLE/NEW EARTH (OPRAH 61),5a9a3e9b4f61b6d59c367066,9780452289963,A New Earth
2916,Nick Offerman,"""""",5a9b4bca4f61b6ed19a8b982,9781101984659,Good Clean Fun
3131,Abbi Jacobson,"""""",5a9b4c284f61b6ed19a8ba5e,9780735221598,Carry This Book
4286,Julie Garwood,"""""",5a9e1f034f61b62600fa5eb6,9780345500786,Sizzle
4306,Mike Maden,"""The next Jack Ryan, Jr. novel""--",5a9e1f0f4f61b62600fa5eca,9780735215863,Tom Clancy Point of Contact


In [21]:
# Treat very short Google descriptions (fewer than 37 characters) as missing.
bad_desc = df_gb[df_gb['description'].str.len()<37].index
# Select by index label and column *name*: .iloc with a hard-coded column
# position silently targets a different column whenever the column order
# changes (e.g. when columns follow dict insertion order instead of being sorted).
df_gb.loc[bad_desc, 'description'] = np.nan # replace bad desc with NaN

In [22]:
# Re-tally Google descriptions after the short ones were converted to NaN.
# NOTE(review): the last label is misspelled ("Gogle") and says ">3 character"
# while the filter is len > 1.
gb_desc = df_gb['description']
print('Google records with None (na) description:', gb_desc.isnull().sum())
print('Google records with blank ("") description:', (gb_desc == '').sum())
print('Gogle records with >3 character description:', (gb_desc.str.len() > 1).sum())

Google records with None (na) description: 15
Google records with blank ("") description: 0
Gogle records with >3 character description: 7059


### Merge NYT & Google Books Data

In [23]:
# Left-join Google Books onto the NYT rows by ISBN-13. Overlapping column
# names get _nyt / _google suffixes, and merge_match records whether each
# row was found on the left only or on both sides.
# A left join can return more rows than df_nyt when one ISBN-13 matches
# several df_gb rows.
df = pd.merge(
    df_nyt, df_gb,
    how='left',
    left_on='primary_isbn13', right_on='nyt_isbn13',
    suffixes=('_nyt', '_google'),
    indicator='merge_match',
)
df.shape

(11977, 17)

In [24]:
# Preview the merged frame (NYT columns followed by Google columns)
df.head()

Unnamed: 0,amazon_product_url,author,bestsellers_date,description_nyt,list_name,mongo_id_nyt,other_isbns,primary_isbn13,published_date,best_rank,title_nyt,authors,description_google,mongo_id_google,nyt_isbn13,title_google,merge_match
0,http://www.amazon.com/Give-You-My-Body-Scenes-...,Diana Gabaldon,2016-08-20,The author of the Outlander novels gives tips ...,Advice How-To and Miscellaneous,5a98e57d4f61b6bad451b147,"[{'isbn13': '9780399178573', 'isbn10': '039917...",9780399178573,2016-09-04,8,"""I GIVE YOU MY BODY ...""",Diana Gabaldon,NEW YORK TIMES BESTSELLER • For writers lookin...,5a9a3e7b4f61b6d59c367047,9780399178573,"""I Give You My Body . . .""",both
1,http://www.amazon.com/Most-Blessed-Patriarchs-...,Annette Gordon-Reed and Peter S Onuf,2016-04-16,A character study that attempts to make sense ...,Hardcover Nonfiction,5a9755514f61b68dd7648074,"[{'isbn13': '9780871404428', 'isbn10': '087140...",9780871404428,2016-05-01,16,"""MOST BLESSED OF THE PATRIARCHS""","Annette Gordon-Reed, Peter S. Onuf",A groundbreaking work of history that explicat...,5a97a2954f61b6966baa0905,9780871404428,Most Blessed of the Patriarchs,both
2,http://www.amazon.com/AskGaryVee-Entrepreneurs...,Gary Vaynerchuk,2016-03-12,The entrepreneur expands on subjects addressed...,Advice How-To and Miscellaneous,5a98e5654f61b6bad451afec,"[{'isbn13': '9780062273123', 'isbn10': '006227...",9780062273123,2016-03-27,6,#ASKGARYVEE,Gary Vaynerchuk,The New York Times bestselling author draws fr...,5a9b4bb44f61b6ed19a8b951,9780062273123,#AskGaryVee,both
3,http://www.amazon.com/GIRLBOSS-Sophia-Amoruso/...,Sophia Amoruso,2014-05-10,An online fashion retailer traces her path to ...,Advice How-To and Miscellaneous,5a98e5014f61b6bad451a8fe,"[{'isbn13': '9780399169274', 'isbn10': '039916...",9780399169274,2014-05-25,2,#GIRLBOSS,Sophia Amoruso,The founder of the Nasty Gal fashion e-tailer ...,5a9b4b714f61b6ed19a8b8b9,9780399169274,#Girlboss,both
4,http://www.amazon.com/The-100-Startup-Reinvent...,Chris Guillebeau,2012-05-26,How to turn ideas into income.,Hardcover Advice,5a98dfed4f61b6bad451824c,"[{'isbn13': '9780307951526', 'isbn10': '030795...",9780307951526,2012-06-10,6,$100 STARTUP,Chris Guillebeau,Shares advice for transitioning away from unfu...,5a9a3dcf4f61b6d59c366fa8,9780307951526,The $100 Startup,both


In [25]:
# merge results based on isbn13 key:
# 'both' = NYT row found a Google match, 'left_only' = NYT row without one
df['merge_match'].value_counts()

left_only     6735
both          5242
right_only       0
Name: merge_match, dtype: int64

- About 74% of Google records (5,242 matched rows against 7,074 Google records) matched an NYT record on a single ISBN-13; alternative ISBNs for the same title/author combination were not tried.

In [26]:
# count NaN descriptions per source and in combination
nyt_missing = df['description_nyt'].isnull()
google_missing = df['description_google'].isnull()
print('nyt desc null:', nyt_missing.sum())
print('google desc null:', google_missing.sum())
print('both desc null:', (nyt_missing & google_missing).sum())
print('neither desc null:', (~nyt_missing & ~google_missing).sum())

nyt desc null: 1290
google desc null: 6747
both desc null: 650
neither desc null: 4590


- The counts are consistent: after subtracting "both desc null" from the NYT and Google null counts, (1,290 − 650) + (6,747 − 650) + 650 + 4,590 = 11,977, the number of merged rows.

In [27]:
# for 'nyt null' use the description from Google
# isnull() states the intent directly and also catches None, unlike the
# `x != x` NaN trick; it matches the null checks used for the counts.
desc_nyt_null = df[df['description_nyt'].isnull()].index
# the right-hand Series is aligned on index, so only the selected rows are filled
df.loc[desc_nyt_null,'description_primary'] = df['description_google']

In [28]:
# for 'gb null' use description from NYT
# isnull() states the intent directly and also catches None, unlike the
# `x != x` NaN trick; it matches the null checks used for the counts.
desc_gb_null = df[df['description_google'].isnull()].index
# the right-hand Series is aligned on index, so only the selected rows are filled
df.loc[desc_gb_null,'description_primary'] = df['description_nyt']

In [29]:
# calculate length of both description fields (NaN where the description is missing)
for source in ('nyt', 'google'):
    df[source + '_desc_len'] = df['description_' + source].str.len()

In [30]:
# Preview with the new description_primary and length columns
df.head()

Unnamed: 0,amazon_product_url,author,bestsellers_date,description_nyt,list_name,mongo_id_nyt,other_isbns,primary_isbn13,published_date,best_rank,title_nyt,authors,description_google,mongo_id_google,nyt_isbn13,title_google,merge_match,description_primary,nyt_desc_len,google_desc_len
0,http://www.amazon.com/Give-You-My-Body-Scenes-...,Diana Gabaldon,2016-08-20,The author of the Outlander novels gives tips ...,Advice How-To and Miscellaneous,5a98e57d4f61b6bad451b147,"[{'isbn13': '9780399178573', 'isbn10': '039917...",9780399178573,2016-09-04,8,"""I GIVE YOU MY BODY ...""",Diana Gabaldon,NEW YORK TIMES BESTSELLER • For writers lookin...,5a9a3e7b4f61b6d59c367047,9780399178573,"""I Give You My Body . . .""",both,,104.0,1115.0
1,http://www.amazon.com/Most-Blessed-Patriarchs-...,Annette Gordon-Reed and Peter S Onuf,2016-04-16,A character study that attempts to make sense ...,Hardcover Nonfiction,5a9755514f61b68dd7648074,"[{'isbn13': '9780871404428', 'isbn10': '087140...",9780871404428,2016-05-01,16,"""MOST BLESSED OF THE PATRIARCHS""","Annette Gordon-Reed, Peter S. Onuf",A groundbreaking work of history that explicat...,5a97a2954f61b6966baa0905,9780871404428,Most Blessed of the Patriarchs,both,,76.0,145.0
2,http://www.amazon.com/AskGaryVee-Entrepreneurs...,Gary Vaynerchuk,2016-03-12,The entrepreneur expands on subjects addressed...,Advice How-To and Miscellaneous,5a98e5654f61b6bad451afec,"[{'isbn13': '9780062273123', 'isbn10': '006227...",9780062273123,2016-03-27,6,#ASKGARYVEE,Gary Vaynerchuk,The New York Times bestselling author draws fr...,5a9b4bb44f61b6ed19a8b951,9780062273123,#AskGaryVee,both,,113.0,1428.0
3,http://www.amazon.com/GIRLBOSS-Sophia-Amoruso/...,Sophia Amoruso,2014-05-10,An online fashion retailer traces her path to ...,Advice How-To and Miscellaneous,5a98e5014f61b6bad451a8fe,"[{'isbn13': '9780399169274', 'isbn10': '039916...",9780399169274,2014-05-25,2,#GIRLBOSS,Sophia Amoruso,The founder of the Nasty Gal fashion e-tailer ...,5a9b4b714f61b6ed19a8b8b9,9780399169274,#Girlboss,both,,54.0,275.0
4,http://www.amazon.com/The-100-Startup-Reinvent...,Chris Guillebeau,2012-05-26,How to turn ideas into income.,Hardcover Advice,5a98dfed4f61b6bad451824c,"[{'isbn13': '9780307951526', 'isbn10': '030795...",9780307951526,2012-06-10,6,$100 STARTUP,Chris Guillebeau,Shares advice for transitioning away from unfu...,5a9a3dcf4f61b6d59c366fa8,9780307951526,The $100 Startup,both,,30.0,272.0


In [31]:
# Mean description length per source; Series.mean skips NaN lengths by default
nyt_desc_mean = df['nyt_desc_len'].mean()
google_desc_mean = df['google_desc_len'].mean()
print('avg length of NYT descriptions:', nyt_desc_mean)
print('avg length of Google descriptions:', google_desc_mean)

avg length of NYT descriptions: 99.31964068494432
avg length of Google descriptions: 598.4414913957935


In [32]:
# Mean title length per source
nyt_title_mean = df['title_nyt'].str.len().mean()
google_title_mean = df['title_google'].str.len().mean()
print('avg length of NYT titles:', nyt_title_mean)
print('avg length of Google titles:', google_title_mean)

avg length of NYT titles: 18.185689237705603
avg length of Google titles: 17.980732544830218


In [33]:
# use whichever is longer for primary description field, default to NYT for same length
# Comparisons involving a NaN length are False, so rows where either
# description is missing keep the value assigned in the earlier fallback cells.
nyt_at_least_as_long = df['nyt_desc_len'] >= df['google_desc_len']
google_longer = df['nyt_desc_len'] < df['google_desc_len']
df.loc[nyt_at_least_as_long, 'description_primary'] = df['description_nyt']
df.loc[google_longer, 'description_primary'] = df['description_google']
df.head()

Unnamed: 0,amazon_product_url,author,bestsellers_date,description_nyt,list_name,mongo_id_nyt,other_isbns,primary_isbn13,published_date,best_rank,title_nyt,authors,description_google,mongo_id_google,nyt_isbn13,title_google,merge_match,description_primary,nyt_desc_len,google_desc_len
0,http://www.amazon.com/Give-You-My-Body-Scenes-...,Diana Gabaldon,2016-08-20,The author of the Outlander novels gives tips ...,Advice How-To and Miscellaneous,5a98e57d4f61b6bad451b147,"[{'isbn13': '9780399178573', 'isbn10': '039917...",9780399178573,2016-09-04,8,"""I GIVE YOU MY BODY ...""",Diana Gabaldon,NEW YORK TIMES BESTSELLER • For writers lookin...,5a9a3e7b4f61b6d59c367047,9780399178573,"""I Give You My Body . . .""",both,NEW YORK TIMES BESTSELLER • For writers lookin...,104.0,1115.0
1,http://www.amazon.com/Most-Blessed-Patriarchs-...,Annette Gordon-Reed and Peter S Onuf,2016-04-16,A character study that attempts to make sense ...,Hardcover Nonfiction,5a9755514f61b68dd7648074,"[{'isbn13': '9780871404428', 'isbn10': '087140...",9780871404428,2016-05-01,16,"""MOST BLESSED OF THE PATRIARCHS""","Annette Gordon-Reed, Peter S. Onuf",A groundbreaking work of history that explicat...,5a97a2954f61b6966baa0905,9780871404428,Most Blessed of the Patriarchs,both,A groundbreaking work of history that explicat...,76.0,145.0
2,http://www.amazon.com/AskGaryVee-Entrepreneurs...,Gary Vaynerchuk,2016-03-12,The entrepreneur expands on subjects addressed...,Advice How-To and Miscellaneous,5a98e5654f61b6bad451afec,"[{'isbn13': '9780062273123', 'isbn10': '006227...",9780062273123,2016-03-27,6,#ASKGARYVEE,Gary Vaynerchuk,The New York Times bestselling author draws fr...,5a9b4bb44f61b6ed19a8b951,9780062273123,#AskGaryVee,both,The New York Times bestselling author draws fr...,113.0,1428.0
3,http://www.amazon.com/GIRLBOSS-Sophia-Amoruso/...,Sophia Amoruso,2014-05-10,An online fashion retailer traces her path to ...,Advice How-To and Miscellaneous,5a98e5014f61b6bad451a8fe,"[{'isbn13': '9780399169274', 'isbn10': '039916...",9780399169274,2014-05-25,2,#GIRLBOSS,Sophia Amoruso,The founder of the Nasty Gal fashion e-tailer ...,5a9b4b714f61b6ed19a8b8b9,9780399169274,#Girlboss,both,The founder of the Nasty Gal fashion e-tailer ...,54.0,275.0
4,http://www.amazon.com/The-100-Startup-Reinvent...,Chris Guillebeau,2012-05-26,How to turn ideas into income.,Hardcover Advice,5a98dfed4f61b6bad451824c,"[{'isbn13': '9780307951526', 'isbn10': '030795...",9780307951526,2012-06-10,6,$100 STARTUP,Chris Guillebeau,Shares advice for transitioning away from unfu...,5a9a3dcf4f61b6d59c366fa8,9780307951526,The $100 Startup,both,Shares advice for transitioning away from unfu...,30.0,272.0


In [34]:
# book title probably carries a lot of meaning, so add to primary description
# NOTE: this reads and overwrites description_primary, so re-running the cell
# prepends the title a second time.
title_prefix = df['title_nyt'] + ' '
df['description_primary'] = title_prefix + df['description_primary']
df['description_primary'].head()

0    "I GIVE YOU MY BODY ..." NEW YORK TIMES BESTSE...
1    "MOST BLESSED OF THE PATRIARCHS" A groundbreak...
2    #ASKGARYVEE The New York Times bestselling aut...
3    #GIRLBOSS The founder of the Nasty Gal fashion...
4    $100 STARTUP Shares advice for transitioning a...
Name: description_primary, dtype: object

## Pickle dataframe for topic modeling

In [35]:
# Persist the merged frame for the topic-modeling step
pickle_path = 'merged_books_descriptions_pandas_df_2018-03-08.pkl'
with open(pickle_path, 'wb') as out_file:
    pickle.dump(df, out_file)