In [1]:
import numpy as np
import pandas as pd
import re
import fuzzyset
from fuzzywuzzy import fuzz
import collections
import pandas_import_and_export
from constants import *

# Separator used to join a title and an author into one "title ~ author" key
# (see make_key) and to join the user-entered strings of aggregated rows.
# Its stripped form ("~") is deleted from every cleaned string by
# remove_bad_chars.
sep = " ~ " 

# Summer Reading Book Clustering, part 3: Fuzzy matching

Now that we matched some of the entries to books in the OpenLibrary database, we still have two big tasks ahead of us:
* figure out what to do with the entries that were not matched
* cluster all entries so that a series of books can be viewed together

First, import the csv of matched books created in part 2:

In [4]:
# Load the "matchedbooks" table through the project's CSV helper and preview
# its first ten rows.
df = pandas_import_and_export.read_csv("matchedbooks")
df[:10]

Unnamed: 0,title,author,num_times_read,num_readers,matched_title,matched_author
0,chestnut street,maeve binchy,1,1,,
1,deerskin,robin mckinley,1,1,Deerskin,Robin McKinley
2,dreams of gods and monsters,laini taylor,1,1,,
3,gone with the wind,margaret mitchell,1,1,Gone with the wind,Margaret Mitchell
4,green mansions: a romance of the tropical forest,w. h. hudson,1,1,,
5,herzog,saul bellow,1,1,Herzog,Saul Bellow
6,homegoing,yaa gyasi,1,1,Homegoing,Yaa Gyasi
7,horrible bear!,ame dyckman,1,1,,
8,it's okay to make mistakes,todd parr,5,1,,
9,joe gould's secret,joseph mitchell,1,1,Joe Gould's Secret,Joseph Mitchell


First off, we'll clean all titles and authors.  

"title" and "author" fields should already have been cleaned in part 1, but there are a few steps of cleaning here that weren't done before so we'll clean them anyway.

Matched titles and authors are directly from openlibrary and so far completely unprocessed.  A handful of openlibrary results have weird things like newlines or a lot of spaces in the middle of the titles, so we'll get rid of those specifically.

In [5]:
def consolidate_spaces(s):
    """Collapse every run of consecutive spaces in ``s`` into one space.

    Only the space character is affected; tabs and newlines are left alone.
    """
    return re.sub(r' {2,}', ' ', s)

def remove_bad_chars(s):
    """Return ``s`` without the separator character, newlines, carriage
    returns and tabs.

    The characters are deleted outright; they are not replaced by spaces.
    """
    unwanted = sep.strip() + "\n\r\t"
    return ''.join(ch for ch in s if ch not in unwanted)

def clean(s):
    """Normalize a title or author string.

    Forbidden characters are removed, repeated spaces are collapsed, the
    ends are trimmed and the result is lowercased.
    """
    return consolidate_spaces(remove_bad_chars(s)).strip().lower()
        
# Run the same normalization over all four text columns, then preview.
for column in ('matched_title', 'matched_author', 'title', 'author'):
    df[column] = df[column].apply(clean)
df[:10]

Unnamed: 0,title,author,num_times_read,num_readers,matched_title,matched_author
0,chestnut street,maeve binchy,1,1,,
1,deerskin,robin mckinley,1,1,deerskin,robin mckinley
2,dreams of gods and monsters,laini taylor,1,1,,
3,gone with the wind,margaret mitchell,1,1,gone with the wind,margaret mitchell
4,green mansions: a romance of the tropical forest,w. h. hudson,1,1,,
5,herzog,saul bellow,1,1,herzog,saul bellow
6,homegoing,yaa gyasi,1,1,homegoing,yaa gyasi
7,horrible bear!,ame dyckman,1,1,,
8,it's okay to make mistakes,todd parr,5,1,,
9,joe gould's secret,joseph mitchell,1,1,joe gould's secret,joseph mitchell


Split the matched and unmatched books into two tables so we can do separate operations on them.

In [6]:
# A row is unmatched when both the matched author and the matched title are
# empty strings.
is_unmatched = (df.matched_author == '') & (df.matched_title == '')

# The matched_* columns carry no information for unmatched rows, so they are
# dropped from that table.
dfu = df[is_unmatched].drop(['matched_author', 'matched_title'], axis=1)
dfm = df[~is_unmatched]

Aggregate rows of dfm that have the exact same matched_title and matched_author.

This is not strictly necessary, but helps with the next step--we will be creating lists of all the variations of names for the same book. If we don't do the initial aggregation, those lists will have a bunch of identical names on them.

Just in case we need them later, we will save the initial user-entered info in the "title" and "author" fields, joining them with the sep character.

In [7]:
# Collapse rows that share the same matched title and author into one row:
# the counts are summed and the user-entered titles/authors are kept, joined
# with `sep`.  The string alias 'sum' is used instead of np.sum because newer
# pandas releases emit a FutureWarning when a NumPy reducer is passed to agg.
dfm = dfm.groupby(
    ['matched_title', 'matched_author'],
    as_index=False
).agg(
    {'num_times_read': 'sum',
     'num_readers': 'sum',
     'title': (lambda l: sep.join(l)),
     'author': (lambda l: sep.join(l))}
)
dfm[:10]

Unnamed: 0,matched_title,matched_author,num_times_read,num_readers,title,author
0,deerskin,robin mckinley,1,1,deerskin,robin mckinley
1,gone with the wind,margaret mitchell,1,1,gone with the wind,margaret mitchell
2,herzog,saul bellow,1,1,herzog,saul bellow
3,homegoing,yaa gyasi,1,1,homegoing,yaa gyasi
4,joe gould's secret,joseph mitchell,1,1,joe gould's secret,joseph mitchell
5,ready player one,ernest cline,1,1,ready player one,ernest cline
6,tender is the night,f. scott fitzgerald,1,1,tender is the night,f. scott fitzgerald


In [None]:
# Keep a CSV snapshot of both tables for recordkeeping purposes.
for frame, name in ((dfu, "dfu"), (dfm, "dfm")):
    pandas_import_and_export.to_csv(frame, name)

# It's clustering time! >:D
This is where the real work starts.  The end goal of this part is to create book clusters, each representing either a single book or a series.

### The Book object
* count - total number of times the book was read
* readers - total number of entries for that book (some entries report reading a book multiple times)
* is_openlibrary - whether book's title and author are openlibrary search results or user-entered
* other - for openlibrary books, the user-entered titles and authors that were matched to them (not currently used for anything)

In [9]:
#general functions used in the book object

#for use with fuzzydict, see below
def make_key(title, author):
    """Build the lookup key for a title/author pair: title, `sep`, author."""
    return sep.join((title, author))

In [10]:
class Book:
    """A single title/author entry together with its reading statistics.

    Attributes:
        count: total number of times the book was read.
        readers: total number of entries for the book.
        title: title of the book.
        author: author of the book.
        is_openlibrary: True when title and author are openlibrary search
            results, False when they are user-entered.
        other: optional extra data attached to an openlibrary book (the
            user-entered title and author matched to it); the class itself
            never reads it.
    """

    def __init__(self, count, readers, title, author, is_openlibrary,
                 other=None):

        self.count = count
        self.readers = readers
        self.title = title
        self.author = author
        self.is_openlibrary = is_openlibrary
        self.other = other

        # `other` is only meaningful for openlibrary-backed books.  Use an
        # identity test: `!= None` performs an equality comparison, which
        # breaks for values such as NumPy arrays.
        if other is not None:
            assert is_openlibrary

    def equals(self, book):
        """Return True when ``book`` has the same title and author."""
        return self.title == book.title and self.author == book.author

    def rep(self):
        """Return a printable ``(count, readers, title, author)`` tuple."""
        return (self.count, self.readers, self.title, self.author)

    def key(self):
        """Return the "title ~ author" key built by make_key."""
        return make_key(self.title, self.author)
        

In [11]:
# Three sample books: two openlibrary-style entries and one messy
# user-entered one.
b1 = Book(
    count=3,
    readers=3,
    title="Harry Potter and the Curse of the User-entered Data",
    author="J. K. Rowling",
    is_openlibrary=True,
    other=["\'harry potter and the curse of user entered data\'", "jk rowling"],
)

b2 = Book(
    count=21,
    readers=2,
    title="Harry Potter and the Prisoner of OpenLibrary API",
    author="J. K. Rowling",
    is_openlibrary=True,
)

b3 = Book(
    count=1,
    readers=1,
    title="hary porter and the something something ajds;lkhfdsds",
    author="jk rowling ;D",
    is_openlibrary=False,
)

b1.rep()

(3, 3, 'Harry Potter and the Curse of the User-entered Data', 'J. K. Rowling')

### The Cluster object
* count - total number of times the books in that cluster were read
* readers - total number of entries for a book in the cluster (some entries report reading a book multiple times)
* title - the title of the cluster (which is, in practice, the title of one of the most popular books in the cluster)
* author - author of the cluster
* contents and o_contents
    * contents contains user-entered information
    * o_contents contains information from openlibrary search results.
    * both are lists of books

In [12]:
#general functions used in the cluster object
 
def sort_booklist(l):
    """Return a new list with the books of ``l`` ordered most popular first.

    Popularity is the number of readers when ORDER_BOOKS_BY_READERS is
    truthy and the read count otherwise.
    """
    popularity_attr = "readers" if ORDER_BOOKS_BY_READERS else "count"
    return sorted(l, key=lambda b: getattr(b, popularity_attr), reverse=True)

def rep_booklist(l):
    """Return the list of ``rep()`` results for the books in ``l``, in order."""
    return [book.rep() for book in l]

In [13]:
class Cluster:
    """A group of books treated as one book or one series.

    Attributes:
        count: total number of times the books in the cluster were read.
        readers: total number of entries for the books in the cluster.
        title: title of the cluster (taken from one of its books).
        author: author of the cluster.
        contents: list of user-entered books.
        o_contents: list of openlibrary books.
    """

    def __init__(self, book):
        """Create a cluster that contains only ``book`` and is named after it."""
        self.count = book.count
        self.readers = book.readers
        self.title = book.title
        self.author = book.author

        self.contents = []
        self.o_contents = []
        self._store(book)

    def _store(self, book):
        # File the book in the list that matches where its data came from.
        if book.is_openlibrary:
            self.o_contents.append(book)
        else:
            self.contents.append(book)

    def add_book(self, book):
        """Add ``book`` to the cluster and update the running totals."""
        self.count = self.count + book.count
        self.readers = self.readers + book.readers
        self._store(book)

    def sort_contents(self):
        """Sort both book lists, most popular first (see sort_booklist)."""
        self.contents = sort_booklist(self.contents)
        self.o_contents = sort_booklist(self.o_contents)

    def rename(self):
        """Reset title and author to those of the most popular entry.

        Openlibrary entries are preferred: user-entered books are only
        consulted when the cluster holds no openlibrary book.  The author is
        taken from the most popular book that has a non-empty author; if no
        book has one, the current author is kept.
        """
        self.sort_contents()

        contents_to_use = self.o_contents if self.o_contents else self.contents
        # Only an empty list leaves nothing to rename to.  A single-element
        # list must still be applied: after engulf() a cluster named after a
        # user-entered book may have gained exactly one openlibrary book,
        # and that book should now provide the name.
        if not contents_to_use:
            return

        self.title = contents_to_use[0].title

        for book in contents_to_use:
            if book.author != "":
                self.author = book.author
                break

    def engulf(self, c):
        """Absorb cluster ``c``: take over its books, add its totals, rename.

        ``c`` itself is not modified.
        """
        self.count = self.count + c.get_count()
        self.readers = self.readers + c.get_readers()

        (c_o_contents, c_contents) = c.get_contents()
        self.o_contents = self.o_contents + c_o_contents
        self.contents = self.contents + c_contents

        self.rename()

    def key(self):
        """Return the "title ~ author" key built by make_key."""
        return make_key(self.title, self.author)

    def get_count(self):
        """Return the total read count of the cluster."""
        return self.count

    def get_readers(self):
        """Return the total number of entries in the cluster."""
        return self.readers

    def get_title(self):
        """Return the cluster title."""
        return self.title

    def get_author(self):
        """Return the cluster author."""
        return self.author

    def get_contents(self):
        """Return the pair ``(o_contents, contents)``."""
        return (self.o_contents, self.contents)

    def get_contents_list(self):
        """Return all books in one list, openlibrary books first."""
        return self.o_contents + self.contents

    def rep(self, v):
        """Return a printable tuple describing the cluster, for debugging.

        With a truthy ``v`` the tuple also contains the representations of
        both book lists.  Call rename() or sort_contents() first to get the
        books in sorted order.
        """
        summary = (self.count, self.readers, self.title, self.author)
        if v:
            return summary + (rep_booklist(self.o_contents),
                              rep_booklist(self.contents))
        return summary

In [14]:
# Start a cluster from the first sample book and show its verbose form.
c = Cluster(book=b1)
c.rep(v=True)

(3,
 3,
 'Harry Potter and the Curse of the User-entered Data',
 'J. K. Rowling',
 [(3,
   3,
   'Harry Potter and the Curse of the User-entered Data',
   'J. K. Rowling')],
 [])

In [15]:
# Add the remaining sample books, refresh the cluster's name, then show it.
for extra_book in (b2, b3):
    c.add_book(extra_book)
c.rename()
c.rep(v=True)

(25,
 6,
 'Harry Potter and the Curse of the User-entered Data',
 'J. K. Rowling',
 [(3,
   3,
   'Harry Potter and the Curse of the User-entered Data',
   'J. K. Rowling'),
  (21,
   2,
   'Harry Potter and the Prisoner of OpenLibrary API',
   'J. K. Rowling')],
 [(1,
   1,
   'hary porter and the something something ajds;lkhfdsds',
   'jk rowling ;D')])

### Matching functions

These functions use the FuzzyWuzzy module to determine whether a book is a good match for a cluster.

In [17]:
def get_title_confidence(title1, title2):
    """Score the similarity of two titles with FuzzyWuzzy's token_set_ratio."""
    score = fuzz.token_set_ratio(title1, title2)
    return score

def get_author_confidence(a1, a2):
    """Score the similarity of two author strings with FuzzyWuzzy's token_set_ratio."""
    score = fuzz.token_set_ratio(a1, a2)
    return score

def first_i_tokens(s, i):
    """Return the first ``i`` space-separated tokens of ``s``, joined by spaces.

    Slicing never raises, so a string with fewer than ``i`` tokens simply
    comes back unchanged; no exception handling is needed.
    """
    return " ".join(s.split(" ")[:i])

def get_partial_title_confidence(key1, key2):
    """Return the best token_set_ratio between equally long prefixes.

    The first 2, 3 and 4 space-separated tokens of both strings are compared
    and the highest score is returned, so two titles that start the same way
    can score highly even when their full titles differ.
    """
    prefix_scores = [
        fuzz.token_set_ratio(first_i_tokens(key1, n), first_i_tokens(key2, n))
        for n in (2, 3, 4)
    ]
    return max(0, *prefix_scores)

def is_match_book(b, c, d):
    """Check whether book ``b`` matches cluster ``c``; ``d`` enables debug prints."""
    title, author = b.title, b.author
    return is_match(title, author, c, d)

def is_match(title, author, c, d):
    """Decide whether a title/author pair is a good match for cluster ``c``.

    The title is compared with the cluster title, both in full and by
    prefix.  The author is compared with every non-empty author among the
    cluster's books and the scores are averaged.  When either side has no
    author, only the title scores are used, against the *_NO_AUTHOR cutoffs.
    Otherwise a title score and the average author score must both exceed
    their cutoffs.

    ``d`` is a debug flag: when truthy the intermediate scores are printed.
    """
    cluster_title = c.get_title()
    full_title_conf = get_title_confidence(title, cluster_title)
    partial_title_conf = get_partial_title_confidence(title, cluster_title)
    if d:
        print(" ")
        print("full_title_conf: " + str(full_title_conf))
        print("partial_title_conf: " + str(partial_title_conf))

    # One author score per cluster book that actually has an author.
    auth_conf = [
        get_author_confidence(author, cbook.author)
        for cbook in c.get_contents_list()
        if cbook.author != ""
    ]

    # No author on one side or the other: judge on the title alone.
    if not auth_conf or author == "":
        if d:
            print("no author")
        return (full_title_conf > FULL_TITLE_CONF_CUT_NO_AUTHOR
                or partial_title_conf > PARTIAL_TITLE_CONF_CUT_NO_AUTHOR)

    avg_auth_conf = np.average(auth_conf)
    if d:
        print("avg_auth_conf: " + str(avg_auth_conf))

    title_ok = (full_title_conf > FULL_TITLE_CONF_CUT
                or partial_title_conf > PARTIAL_TITLE_CONF_CUT)
    return title_ok and avg_auth_conf > AUTH_CONF_CUT

In [18]:
# Try the matcher on the sample books, including one without an author.
c = Cluster(b1)
b4 = Book(1, 1, "hary porter and the something something ajds;lkhfdsds", "", is_openlibrary=False)
for candidate in (b2, b3, b4):
    print(is_match_book(candidate, c, True))

 
full_title_conf: 69
partial_title_conf: 100
avg_auth_conf: 100.0
True
 
full_title_conf: 58
partial_title_conf: 92
avg_auth_conf: 87.0
True
 
full_title_conf: 58
partial_title_conf: 92
no author
True


### The FuzzyDict object
The above matching functions use FuzzyWuzzy, which is great for determining whether two books match.  But what if we're given a book and want to find the closest match from a long list of other books, without individually checking every book with fuzzywuzzy?

That's exactly what the fuzzyset module does.  The matching can't be fine-tuned as much as FuzzyWuzzy's, but it is very fast.  The downside is that a FuzzySet is just a set of strings, whereas we want it to store Clusters.

Enter FuzzyDict!  This object contains both a dictionary (d) and a corresponding fuzzyset (fs).  The set of keys of d will be identical to the set of strings in fs, and each one will be of the form "title ~ author".  Each key's value in d will be the cluster with that title and author.

In [19]:
#general functions used in fuzzydict object
def sort_cluster_list(l):
    """Return a new list with the clusters of ``l`` ordered most popular first.

    Popularity is the number of readers when ORDER_BOOKS_BY_READERS is
    truthy and the read count otherwise.
    """
    def popularity(cluster):
        if ORDER_BOOKS_BY_READERS:
            return cluster.get_readers()
        return cluster.get_count()

    return sorted(l, key=popularity, reverse=True)

In [20]:
class FuzzyDict:
    """A dictionary of clusters paired with a FuzzySet of its keys.

    Attributes:
        d: dict mapping a "title ~ author" key to a Cluster.
        fs: FuzzySet holding the same keys, used to look up the closest key
            for a new book.
    """

    # A FuzzySet candidate must score above this before the FuzzyWuzzy check
    # in is_match_book is consulted.
    MIN_FUZZYSET_CONFIDENCE = .4

    def initialize(self):
        """Reset the object to an empty dictionary and an empty FuzzySet."""
        self.d = dict()
        self.fs = fuzzyset.FuzzySet(gram_size_lower=3, gram_size_upper=4)

    def __init__(self):
        self.initialize()

    def rename_clusters(self):
        """Rename every cluster, then rebuild d and fs from the new keys.

        The keys depend on cluster title and author, which rename() may
        change, so both containers are rebuilt from scratch; otherwise new
        books would keep being matched against the old titles.
        """
        clusters = self.as_list()
        for c in clusters:
            c.rename()

        self.initialize()
        for cluster in clusters:
            key = cluster.key()
            existing = self.d.get(key)
            if existing is None:
                self.d[key] = cluster
                self.fs.add(key)
            else:
                # Two clusters now share a title and author.  Merge them so
                # the later one does not silently replace the earlier one
                # and drop its books and counts.  The merged cluster stays
                # filed under `key`.
                existing.engulf(cluster)

    def as_list(self):
        """Return the clusters as a list, in dictionary order."""
        return list(self.d.values())

    def as_sorted_list(self):
        """Return the clusters ordered most popular first (see sort_cluster_list)."""
        #possibly sort clusters by number of readers instead?
        return sort_cluster_list(self.as_list())

    def rep(self, limit, v):
        """Return printable representations of the ``limit`` most popular clusters.

        ``v`` is passed to Cluster.rep.  Slicing never raises, so a
        ``limit`` larger than the number of clusters returns them all.
        """
        return [c.rep(v) for c in self.as_sorted_list()][:limit]

    def add_new_cluster(self, book):
        """File ``book`` under its own key.

        If a cluster is already stored under that exact key the book is
        added to it, because overwriting the entry would discard the
        existing cluster together with its books and counts.
        """
        key = book.key()
        existing = self.d.get(key)
        if existing is not None:
            existing.add_book(book)
            return
        self.d[key] = Cluster(book)
        self.fs.add(key)

    def add_book(self, book, d):
        """Add ``book`` to its best-matching cluster, or start a new cluster.

        The closest key is fetched from the FuzzySet and confirmed with
        is_match_book.  ``d`` is a debug flag: when truthy the matching
        details are printed.
        """
        proposed_key = book.key()
        matches = self.fs.get(proposed_key)

        # fs.get can come back with nothing (None or an empty list).
        if not matches:
            if d: print("no match returned from fuzzyset")
            self.add_new_cluster(book)
            return

        (confidence, match_key) = matches[0]

        if d:
            print(" ")
            print("proposed: " + proposed_key)
            print("matched: " + match_key)
            print("confidence: " + str(confidence))
            if len(matches) > 1: print("other matches: " + str(matches[1:]))

        match_cluster = self.d[match_key]

        if (confidence > self.MIN_FUZZYSET_CONFIDENCE
                and is_match_book(book, match_cluster, d)):
            if d: print("adding to cluster")
            match_cluster.add_book(book)
        else:
            if d: print("no match")
            self.add_new_cluster(book)

## Phase 1

Phase 1 will only involve the openlibrary matched books (dfm).  Unlike the books in dfu, these are (mostly) guaranteed to be spelled correctly and such.  Cluster together similar books and rename the clusters.

In [22]:
# Phase 1: cluster the openlibrary-matched rows.  The user-entered title and
# author ride along in `other`.
fd = FuzzyDict()
for _idx, row in dfm.iterrows():
    matched_book = Book(
        row.num_times_read,
        row.num_readers,
        row.matched_title,
        row.matched_author,
        is_openlibrary=True,
        other=(row.title, row.author)
    )
    fd.add_book(matched_book, d=DEBUG_FUZZY_MATCHING)
fd.rename_clusters()
fd.rep(limit=10, v=True)

[(412,
  378,
  "harry potter and the sorcerer's stone",
  'j. k. rowling',
  [(99, 87, "harry potter and the sorcerer's stone", 'j. k. rowling', True),
   (60, 59, 'harry potter and the chamber of secrets', 'j. k. rowling', True),
   (61, 54, 'harry potter and the goblet of fire', 'j. k. rowling', True),
   (48,
    44,
    'harry potter and the order of the phoenix',
    'j. k. rowling',
    True),
   (42, 40, 'harry potter and the prisoner of azkaban', 'j. k. rowling', True),
   (39, 38, 'harry potter and the half-blood prince', 'j. k. rowling', True),
   (38, 31, 'harry potter and the deathly hallows', 'j. k. rowling', True),
   (23, 23, 'harry potter and the cursed child', 'j. k. rowling', True),
   (1, 1, "harry potter and the philosopher's stone", 'j. k. rowling', True),
   (1, 1, 'harry potter, tome 3', 'j. k. rowling', True)],
  []),
 (199,
  141,
  'if you give a mouse a cookie',
  'laura numeroff',
  [(70, 39, 'if you give a mouse a cookie', 'laura numeroff', True),
   (20, 

## Phase 2

Try to add the unmatched books to already created clusters, or create new clusters if unable to do so.

In [23]:
# Phase 2: feed the unmatched, user-entered rows into the same FuzzyDict.
for _idx, row in dfu.iterrows():
    entered_book = Book(
        row.num_times_read,
        row.num_readers,
        row.title,
        row.author,
        is_openlibrary=False
    )
    fd.add_book(entered_book, d=DEBUG_FUZZY_MATCHING)
fd.rename_clusters()
fd.rep(limit=10, v=True)

[(412,
  378,
  "harry potter and the sorcerer's stone",
  'j. k. rowling',
  [(99, 87, "harry potter and the sorcerer's stone", 'j. k. rowling', True),
   (60, 59, 'harry potter and the chamber of secrets', 'j. k. rowling', True),
   (61, 54, 'harry potter and the goblet of fire', 'j. k. rowling', True),
   (48,
    44,
    'harry potter and the order of the phoenix',
    'j. k. rowling',
    True),
   (42, 40, 'harry potter and the prisoner of azkaban', 'j. k. rowling', True),
   (39, 38, 'harry potter and the half-blood prince', 'j. k. rowling', True),
   (38, 31, 'harry potter and the deathly hallows', 'j. k. rowling', True),
   (23, 23, 'harry potter and the cursed child', 'j. k. rowling', True),
   (1, 1, "harry potter and the philosopher's stone", 'j. k. rowling', True),
   (1, 1, 'harry potter, tome 3', 'j. k. rowling', True)],
  []),
 (160,
  156,
  'diary of a wimpy kid',
  'jeff kinney',
  [(34, 34, 'diary of a wimpy kid', 'jeff kinney', True),
   (5, 5, 'diary of a wimpy ki

## Phase 3

FuzzySet is really fast, but there are some matches it just can't catch.  After doing all the clustering we can with fuzzyset, we'll fill in the gaps using the much slower method of iterating through each pair of clusters, comparing them with fuzzywuzzy, and merging the clusters that match using the c.engulf() function.

This step would be far too slow if run on all books, so cutoffs on the number of iterations are included.

In [25]:
# Phase 3: pairwise merge of clusters, most popular first.
# Cluster i is compared with every later cluster j; a matching j is engulfed
# into i and removed from the list.  Two cutoffs bound the work: only the
# first 1000 positions of the list are visited, and at most `maxloops`
# comparisons are made in total.
l = fd.as_sorted_list()
i=0
loops = 0  # number of pair comparisons performed so far
maxloops = 10000
while i < min(len(l), 1000) and loops < maxloops:

    j=i+1
    while j < min(len(l), 1000) and loops < maxloops:
        
        # compare cluster i's current title/author against cluster j
        if is_match(l[i].get_title(), l[i].get_author(),
                    l[j],
                    False):
            #engulf cluster j into cluster i
            #then remove cluster j from the list
            #dont increment j as next cluster will now be in jth position
            if DEBUG_FUZZY_MATCHING:
                print(" ")
                print("merging clusters:")
                print(" ")
                print(repr(l[i].rep(v=False)))
                print(" ")
                print(repr(l[j].rep(v=False)))
                print(" ")
            l[i].engulf(l[j])
            l.pop(j)
        else:
            j = j+1
        
        loops = loops + 1
    
    i = i+1

# merging changes the totals, so restore the most-popular-first order
l = sort_cluster_list(l)

 
merging clusters:
 
(289, 195, 'curious george', 'h. a. rey')
 
(58, 48, 'curious george goes to a chocolate factory', 'h. a. rey')
 
 
merging clusters:
 
(347, 243, 'curious george', 'h. a. rey')
 
(49, 32, 'curious george at the baseball game', 'h.a. and margret rey')
 
 
merging clusters:
 
(396, 275, 'curious george', 'h. a. rey')
 
(30, 24, 'curious george flies a kite (curious george)', 'margret rey, h. a. rey')
 
 
merging clusters:
 
(426, 299, 'curious george', 'h. a. rey')
 
(24, 22, 'curious george goes to a costume party', 'margaret and h. a. rey')
 
 
merging clusters:
 
(450, 321, 'curious george', 'h. a. rey')
 
(8, 6, "curious george: plumber's helper", 'h.a. rey')
 
 
merging clusters:
 
(142, 138, 'captain underpants', 'dav pilkey')
 
(47, 47, 'captain underpants', 'dav pilkey')
 
 
merging clusters:
 
(152, 135, 'fancy nancy', "jane o'connor")
 
(12, 12, 'fancy nancy best reading buddies', '')
 
 
merging clusters:
 
(164, 147, 'fancy nancy', "jane o'connor")
 
(9

 
 
merging clusters:
 
(11, 10, 'and to think that i saw it on mulberry street', 'dr. seuss')
 
(4, 3, 'bartholomew and the oobleck', 'dr. seuss')
 
 
merging clusters:
 
(10, 10, 'feathers and hair: what animals wear', 'jennifer ward')
 
(4, 4, 'animal babies in towns and cities', 'jennifer schofield')
 
 
merging clusters:
 
(11, 9, 'horns to toes and in between', 'sandra boynton')
 
(6, 4, 'horns to toes and in between', 'sandra boynton')
 
 
merging clusters:
 
(9, 9, 'd.w. goes to preschool', 'marc tolon brown')
 
(3, 3, "d.w.'s lost blankie", 'marc tolon brown')
 
 
merging clusters:
 
(11, 9, 'animals =', 'eric carle')
 
(6, 4, 'animals =', 'eric carle')
 
 
merging clusters:
 
(8, 8, 'cooking light', 'heather averett')
 
(4, 4, 'cooking light magazine', '')
 
 
merging clusters:
 
(8, 8, 'i am too absolutely small for school', 'lauren child')
 
(4, 4, 'absolutely one thing: featuring charlie and lola', 'lauren child')
 
 
merging clusters:
 
(8, 8, '5minute disney pixar storie

In [26]:
# Verbose representation of every cluster; show the first ten.
replist = [cluster.rep(v=True) for cluster in l]

replist[:10]

[(412,
  378,
  "harry potter and the sorcerer's stone",
  'j. k. rowling',
  [(99, 87, "harry potter and the sorcerer's stone", 'j. k. rowling', True),
   (60, 59, 'harry potter and the chamber of secrets', 'j. k. rowling', True),
   (61, 54, 'harry potter and the goblet of fire', 'j. k. rowling', True),
   (48,
    44,
    'harry potter and the order of the phoenix',
    'j. k. rowling',
    True),
   (42, 40, 'harry potter and the prisoner of azkaban', 'j. k. rowling', True),
   (39, 38, 'harry potter and the half-blood prince', 'j. k. rowling', True),
   (38, 31, 'harry potter and the deathly hallows', 'j. k. rowling', True),
   (23, 23, 'harry potter and the cursed child', 'j. k. rowling', True),
   (1, 1, "harry potter and the philosopher's stone", 'j. k. rowling', True),
   (1, 1, 'harry potter, tome 3', 'j. k. rowling', True)],
  []),
 (458,
  327,
  'curious george',
  'h. a. rey',
  [(67, 18, 'curious george', 'h. a. rey', True),
   (17, 17, 'curious george goes to a chocolat

### Exporting the Data
Now that we're finished clustering the books, it's time to export the results to an accessible format.  We'll put them in two dataframes, cluster_df and book_df, each of which will be exported to an excel file.  

cluster_df will contain only the title, author, and count of the cluster.  If users wish to see the books contained in a particular cluster, they can search for that cluster id in the "books" file.

In [27]:
# Flatten the clusters into parallel column lists: one set of lists with a
# value per cluster, and one set with a value per book.  A book points back
# to its cluster through the cluster's position in `l`.
counts, readers, titles, authors = [], [], [], []

cluster_ids, book_counts, book_readers = [], [], []
book_titles, book_authors, is_openlibrarys = [], [], []

cluster_id = 0
for cluster in l:
    counts.append(cluster.get_count())
    readers.append(cluster.get_readers())
    titles.append(cluster.get_title())
    authors.append(cluster.get_author())

    for member in cluster.get_contents_list():
        cluster_ids.append(cluster_id)
        book_counts.append(member.count)
        book_readers.append(member.readers)
        book_titles.append(member.title)
        book_authors.append(member.author)
        is_openlibrarys.append(member.is_openlibrary)
    cluster_id += 1

In [35]:
# Assemble the per-cluster table; the OrderedDict fixes the column order.
cluster_columns = collections.OrderedDict()
cluster_columns['count'] = counts
cluster_columns['readers'] = readers
cluster_columns['title'] = titles
cluster_columns['author'] = authors

cluster_df = pd.DataFrame(cluster_columns)
cluster_df[:10]

Unnamed: 0,count,readers,title,author
0,412,378,harry potter and the sorcerer's stone,j. k. rowling
1,458,327,curious george,h. a. rey
2,189,185,captain underpants,dav pilkey
3,183,165,fancy nancy,jane o'connor
4,160,156,diary of a wimpy kid,jeff kinney
5,199,141,if you give a mouse a cookie,laura numeroff
6,114,105,goodnight moon,margaret wise brown
7,117,92,green eggs and ham,dr. seuss
8,166,89,chicka chicka boom boom,"martin, bill"
9,118,83,i am going!,mo willems


In [36]:
# Assemble the per-book table; the OrderedDict fixes the column order.
book_columns = collections.OrderedDict()
book_columns['cluster_id'] = cluster_ids
book_columns['count'] = book_counts
book_columns['readers'] = book_readers
book_columns['title'] = book_titles
book_columns['author'] = book_authors
book_columns['is_openlibrary'] = is_openlibrarys

book_df = pd.DataFrame(book_columns)
book_df[:10]

Unnamed: 0,cluster_id,count,readers,title,author,is_openlibrary
0,0,99,87,harry potter and the sorcerer's stone,j. k. rowling,True
1,0,60,59,harry potter and the chamber of secrets,j. k. rowling,True
2,0,61,54,harry potter and the goblet of fire,j. k. rowling,True
3,0,48,44,harry potter and the order of the phoenix,j. k. rowling,True
4,0,42,40,harry potter and the prisoner of azkaban,j. k. rowling,True
5,0,39,38,harry potter and the half-blood prince,j. k. rowling,True
6,0,38,31,harry potter and the deathly hallows,j. k. rowling,True
7,0,23,23,harry potter and the cursed child,j. k. rowling,True
8,0,1,1,harry potter and the philosopher's stone,j. k. rowling,True
9,0,1,1,"harry potter, tome 3",j. k. rowling,True


In [None]:
# Write both result tables out through the project's CSV helper.
for frame, name in ((cluster_df, "cluster_df"), (book_df, "book_df")):
    pandas_import_and_export.to_csv(frame, name)