In [5]:
# ISBN of the book to analyze; Book_class uses it as the name of the
# book's directory on disk.
isbn = 9780618574940

In [6]:
from os import listdir
from os.path import isfile, join
import sys
sys.path.append("/Users/andyreagan/tools/python")
from kitchentable.dogtoys import *
from json import loads
from re import findall,UNICODE
from labMTsimple.labMTsimple.speedy import LabMT
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
class Book_class(object):
    '''Book class to handle loading the calibre expanded epub format.

    Store all of the word lists, etc, in one place.

    The helpers ``listify`` and ``stopper`` are not defined here; they are
    expected to already be in scope (NOTE(review): presumably via the
    notebook's star import -- confirm).

    Attributes set by ``load_all`` (which the constructor calls):
        files, txtfiles: listings of the book directory and its ``txt``
            subdirectory (``txtfiles`` is sorted by name).
        metadata: parsed contents of ``meta.json``.
        rawtext_by_chapter: contents of every file in ``txt``, in
            ``txtfiles`` order.
        word_lists_by_chapter: ``listify`` applied to each chapter's text.
        chapter_beginnings, chapter_ends, chapter_centers: cumulative word
            offsets of each chapter within the book.

    Attributes set by ``chop`` / ``chopper_sliding``:
        all_words, all_word_list: the chapters joined with spaces, and the
            ``listify``-ed version of that text.
        all_fvec: result of the most recent ``chop`` call.
    '''

    # Directory that holds one sub-directory per ISBN.
    DEFAULT_DATA_DIR = "data/kindle/kindle-015"

    def __init__(self,isbn,data_dir=DEFAULT_DATA_DIR):
        """Remember the ISBN and load the book from ``data_dir``/``isbn``."""
        self.isbn = isbn
        self.load_all(isbn,data_dir=data_dir)

    def __str__(self):
        """Return the title from the metadata, or a placeholder without one."""
        if "title" in self.metadata:
            return self.metadata["title"]
        else:
            return "Book (no title)"

    def load_all(self,isbn,data_dir=DEFAULT_DATA_DIR):
        """Read metadata and chapter text for ``isbn`` and index the chapters.

        Expects ``<data_dir>/<isbn>/meta.json`` and a ``<data_dir>/<isbn>/txt``
        directory with one text file per chapter. Raises whatever ``listdir``
        / ``open`` raise (e.g. FileNotFoundError) when the book is missing.
        """
        book_dir = data_dir.rstrip("/")+"/"+str(isbn)
        txt_dir = book_dir+"/txt"

        self.files = listdir(book_dir)
        # listdir returns names in arbitrary order; sort so that the chapter
        # order is the same on every filesystem.
        # NOTE(review): this is a plain string sort -- confirm the chapter
        # file names are zero-padded so that it matches reading order.
        self.txtfiles = sorted(listdir(txt_dir))

        print("a sample of the text files:")
        print(self.txtfiles[:10])
        # `with` closes the handle even if reading or parsing raises
        with open(book_dir+"/meta.json","r") as f:
            self.metadata = loads(f.read())
        print("this is the metadata:")
        print(self.metadata)

        self.rawtext_by_chapter = []
        for fname in self.txtfiles:
            with open(txt_dir+"/"+fname,"r") as f:
                self.rawtext_by_chapter.append(f.read())
        self.word_lists_by_chapter = [listify(t) for t in self.rawtext_by_chapter]

        # word offsets: a chapter ends at the running total of the lengths
        # and begins where the previous chapter ended
        chapter_lengths = np.array([len(words) for words in self.word_lists_by_chapter],dtype=int)
        self.chapter_ends = np.cumsum(chapter_lengths)
        self.chapter_beginnings = self.chapter_ends-chapter_lengths
        self.chapter_centers = (self.chapter_ends+self.chapter_beginnings)/2

    def _build_all_words(self):
        """Join all chapters into one text and store its word list."""
        self.all_words = " ".join(self.rawtext_by_chapter)
        self.all_word_list = listify(self.all_words)

    @staticmethod
    def _count_words(words):
        """Return a dict mapping each word in ``words`` to its count."""
        counts = dict()
        for word in words:
            counts[word] = counts.get(word,0)+1
        return counts

    def chop(self,my_senti_dict,min_size=1000):
        """Split the book into consecutive chunks and vectorize each one.

        The book is cut into floor(n_words/min_size) chunks of ``min_size``
        words; the last chunk also takes every remaining word, so no word is
        dropped. A book shorter than ``min_size`` words yields no chunks.

        Returns (and stores in ``self.all_fvec``) a list with the result of
        ``my_senti_dict.wordVecify`` for each chunk's word-count dict.
        """
        print("splitting the book into chunks of minimum size {}".format(min_size))

        self._build_all_words()
        num_words = len(self.all_word_list)
        num_chunks = int(np.floor(num_words/min_size))

        self.all_fvec = []
        for i in range(num_chunks):
            start = i*min_size
            if i == num_chunks-1:
                # take the rest, up to and including the very last word
                print('last chunk')
                end = num_words
            else:
                end = (i+1)*min_size
            print('getting words ' + str(start) + ' through ' + str(end))

            # Rebuild the chunk text and tokenize it again, as before:
            # listify is an external helper, so its output on the re-joined
            # text is kept as the source of truth for the counts.
            chunk = "".join(word+" " for word in self.all_word_list[start:end])
            chunk_dict = self._count_words(listify(chunk))
            self.all_fvec.append(my_senti_dict.wordVecify(chunk_dict))

        return self.all_fvec

    def chopper_sliding(self,my_senti_dict,min_size=10000,num_points=100,stop_val=0.0,return_centers=False):
        """Take long piece of text and generate the sentiment time series.
        We will now slide the window along, rather than make uniform pieces.

        ``num_points`` windows of ``min_size`` words are taken at evenly
        spaced offsets; the final window runs to the end of the book. For
        each window the word-frequency vector is passed through ``stopper``
        and the score is dot(scorelist, stopped) / sum(stopped).

        Can be called without calling ``chop`` first: the full word list is
        built on demand. With ``num_points=1`` the single window is the whole
        book, and a book shorter than ``min_size`` uses a step of 0.

        Returns ``(timeseries, all_fvecs)``, plus ``window_centers`` as a
        third element when ``return_centers`` is true. Raises ValueError if
        ``num_points`` is less than 1.
        """
        if num_points < 1:
            raise ValueError("num_points must be at least 1")

        print("splitting the book into {} chunks of minimum size {}".format(num_points,min_size))

        if not hasattr(self,"all_word_list"):
            self._build_all_words()
        num_words = len(self.all_word_list)

        # how much to jump
        # take one chunk out, and divide by the number of others we want (-1, the one we just took out)
        # take the floor of this as the step, so we may take slightly smaller steps than possible
        if num_points > 1:
            # never step backwards when the book is shorter than one window
            step = max(int(np.floor((num_words-min_size)/(num_points-1))),0)
        else:
            step = 0
        print("there are "+str(num_words)+" words in the book")
        print("step size "+str(step))

        timeseries = []
        all_fvecs = []
        window_centers = []
        last = num_points-1
        for i in range(num_points):
            start = i*step
            window_centers.append(start+(min_size)/2)
            if i == last:
                # only difference: go to the end
                # may be 10-100 more words there (we used floor on the step)
                window = self.all_word_list[start:]
            else:
                window = self.all_word_list[start:(min_size+start)]
            # build the whole dict each time (could be a little better about this)
            text_fvec = my_senti_dict.wordVecify(self._count_words(window))
            stoppedVec = stopper(text_fvec,my_senti_dict.scorelist,my_senti_dict.wordlist,stopVal=stop_val)
            timeseries.append(np.dot(my_senti_dict.scorelist,stoppedVec)/np.sum(stoppedVec))
            all_fvecs.append(text_fvec)

        if return_centers:
            return timeseries,all_fvecs,window_centers

        return timeseries,all_fvecs

In [4]:
# Load the book's metadata and chapters from disk, then print its title
# (or a placeholder when the metadata has none).
my_book = Book_class(isbn)
print(my_book)

# One frequency vector per consecutive chunk of the book (chop's default
# chunk size is 1000 words).
my_LabMT = LabMT(stopVal=0.0)
all_fvecs = my_book.chop(my_LabMT)

# Keep one chapter (index 11) as raw text `c` and as a word list `w`;
# the cells below use it as the sample for the entity experiments.
c = my_book.rawtext_by_chapter[11]
w = my_book.word_lists_by_chapter[11]
print(c)

FileNotFoundError: [Errno 2] No such file or directory: 'data/kindle/kindle-015/9780618574940'

In [None]:
# First 20 entries of the sample chapter's word list.
print(w[:20])

In [None]:
from spacy.en import English
# Model not installed. Please run 'python -m spacy.en.download' to install latest compatible model.
from spacy.parts_of_speech import NAMES

In [None]:
# Build the English pipeline object used for all parsing below.
nlp = English()

In [None]:
# Run the pipeline over the raw text of the sample chapter.
doc = nlp(c)

In [None]:
# print(doc)

In [None]:
# NOTE(review): presumably reports whether a dependency parse is available -- confirm for the spaCy version in use.
doc.is_parsed

In [None]:
doc.vector

In [None]:
# Contents of the NAMES object imported from spacy.parts_of_speech.
NAMES

In [None]:
# How many entity spans were found in the sample chapter.
len(doc.ents)

In [None]:
doc.ents

In [None]:
# List every entity next to its label.
for e in doc.ents:
    print(e,e.label_)

In [None]:
# The next cells inspect the first entity: its type, label, root token and string form.
doc.ents[0]

In [None]:
type(doc.ents[0])

In [None]:
doc.ents[0].label_

In [None]:
type(doc.ents[0].root)

In [None]:
doc.ents[0].root.text

In [None]:
str(doc.ents[0])

In [None]:
# Tally how often each entity string (trailing whitespace stripped) occurs
# in the sample chapter, and collect the label of every occurrence.
entity_counts = {}
entity_counts_labels = {}
for ent in doc.ents:
    ent_text = ent.string.rstrip()
    entity_counts[ent_text] = entity_counts.get(ent_text, 0) + 1
    entity_counts_labels.setdefault(ent_text, []).append(ent.label_)

In [None]:
# Show the list of labels collected for each entity string.
entity_counts_labels

In [None]:
# flatten those counts? -- same tally, keyed on the text of each entity's
# root token instead of the full entity string
entity_root_counts = {}
for ent in doc.ents:
    root_text = ent.root.text
    entity_root_counts[root_text] = entity_root_counts.get(root_text, 0) + 1

In [None]:
# Show the per-root-token tally.
entity_root_counts

In [None]:
# Turn both tallies into lists of (key, count) tuples so they can be sorted.
entity_root_counts_flat = list(entity_root_counts.items())
entity_counts_flat = list(entity_counts.items())
def dictify(my_list):
    """Return a dict mapping each item of ``my_list`` to how often it occurs."""
    tally = {}
    for item in my_list:
        tally[item] = tally.get(item, 0) + 1
    return tally
# One (entity string, number of occurrences, {label: count}) tuple per entity.
entity_counts_labels_flat = [
    (ent_text, len(labels), dictify(labels))
    for ent_text, labels in entity_counts_labels.items()
]

In [None]:
# Just the keys (root-token texts) of the root tally.
entity_root_counts_ents = [ent for ent in entity_root_counts]

In [None]:
# First ten (root text, count) pairs, unsorted.
entity_root_counts_flat[:10]

In [None]:
# Ten most frequent root tokens.
sorted(entity_root_counts_flat,key=lambda name: name[1],reverse=True)[:10]

In [None]:
# Ten most frequent full entity strings.
sorted(entity_counts_flat,key=lambda name: name[1],reverse=True)[:10]

In [None]:
# Ten most frequent entity strings together with their label breakdown.
sorted(entity_counts_labels_flat,key=lambda name: name[1],reverse=True)[:10]

The labels aren't helpful...

In [None]:
# store them across the whole book
all_ents = {}         # entity string -> occurrences over all processed chapters
all_ents_roots = {}   # root-token text -> occurrences over all processed chapters
all_ents_labels = {}  # entity string -> labels, one list entry per occurrence

# One {entity string: count} dict per *processed* chapter. Chapters of 1000
# words or fewer are skipped, so positions in this list are not chapter numbers.
chapter_dicts = []
for i,chap in enumerate(my_book.rawtext_by_chapter):
    if len(my_book.word_lists_by_chapter[i]) <= 1000:
        continue

    print("-"*80)
    print("chapter {}".format(i+1))
    print("-"*80)
    doc = nlp(chap)
    print("processing counts")

    entity_counts = {}
    entity_root_counts = {}
    for ent in doc.ents:
        # full entity string: per-chapter and book-wide tallies, plus labels
        ent_text = ent.string.rstrip()
        entity_counts[ent_text] = entity_counts.get(ent_text, 0) + 1
        all_ents[ent_text] = all_ents.get(ent_text, 0) + 1
        all_ents_labels.setdefault(ent_text, []).append(ent.label_)

        # root token of the entity: per-chapter and book-wide tallies
        root_text = ent.root.text.rstrip()
        entity_root_counts[root_text] = entity_root_counts.get(root_text, 0) + 1
        all_ents_roots[root_text] = all_ents_roots.get(root_text, 0) + 1

    # show the ten most frequent roots and entity strings of this chapter
    entity_root_counts_flat = list(entity_root_counts.items())
    entity_counts_flat = list(entity_counts.items())
    print(sorted(entity_root_counts_flat,key=lambda pair: pair[1],reverse=True)[:10])
    print(sorted(entity_counts_flat,key=lambda pair: pair[1],reverse=True)[:10])
    chapter_dicts.append(entity_counts)

In [None]:
# `doc` is now the last chapter processed by the loop above; the cells
# below look at assorted attributes of its first entity.
doc.ents[0].vector

In [None]:
doc.ents[0].lefts

In [None]:
doc.ents[0].rights

In [None]:
doc.ents[0].subtree

In [None]:
doc.ents[0].orth_

In [None]:
doc.ents[0].lemma_

In [None]:
# `.string` is the attribute used as the dictionary key in the tallies above (after rstrip).
doc.ents[0].string

In [None]:
doc.ents[0].label_

In [None]:
# Book-wide tallies as (key, count) tuples, then the 30 most frequent of each.
all_entity_root_counts_flat = list(all_ents_roots.items())
all_entity_counts_flat = list(all_ents.items())
print(sorted(all_entity_root_counts_flat,key=lambda pair: pair[1],reverse=True)[:30])
print(sorted(all_entity_counts_flat,key=lambda pair: pair[1],reverse=True)[:30])

In [None]:
# One [entity string, number of occurrences, {label: count}] row per entity.
all_ents_labels_flat = [[ent,len(all_ents_labels[ent]),dictify(all_ents_labels[ent])] for ent in all_ents_labels]
# Most frequent first, truncated to the top 30 rows.
# NOTE(review): because of this [:30], the later `[:100]` slice over this
# list can never see more than 30 entities -- confirm which limit is intended.
all_ents_labels_flat_sorted = sorted(all_ents_labels_flat,key=lambda name: name[1],reverse=True)[:30]

In [None]:
# Peek at the two most frequent entities.
all_ents_labels_flat_sorted[:2]

In [None]:
# Sort the most frequent entities into people and locations, based on the
# share of their occurrences that carried each label.
people = []
locations = []
for ent in all_ents_labels_flat_sorted[:100]:
    print(ent)
    ent_text, n_occurrences, label_counts = ent

    p_person = float(label_counts.get("PERSON",0))/n_occurrences
    print("P(PERSON) = {0:.4f}".format(p_person))
    p_loc = float(label_counts.get("LOC",0))/n_occurrences
    print("P(LOC) = {0:.4f}".format(p_loc))
    p_gpe = float(label_counts.get("GPE",0))/n_occurrences

    # a location needs some LOC/GPE evidence and must not be mostly PERSON;
    # anything else with at least one PERSON label counts as a person
    has_place_evidence = p_loc > .01 or p_gpe > .2
    if has_place_evidence and p_person < .5:
        print("{} is a location".format(ent_text))
        locations.append(ent_text)
    elif p_person > 0.0:
        print("{} is a person".format(ent_text))
        people.append(ent_text)

In [None]:
# Entity strings classified as people by the loop above.
print(people)

In [None]:
# Entity strings classified as locations by the loop above.
print(locations)

In [None]:
# Count of the label "E" for the most frequent entity (0 when it never got that label).
all_ents_labels_flat_sorted[0][2].get("E",0)

In [None]:
# For every person: (name, array of mention counts, one entry per processed chapter).
people_probs = [
    (person, np.array([float(counts.get(person, 0)) for counts in chapter_dicts]))
    for person in people
]

In [None]:
# Total number of people mentions per processed chapter, printing each
# person's per-chapter counts along the way.
people_totals = np.zeros(len(chapter_dicts))
for person_name, person_counts in people_probs:
    print(person_counts)
    people_totals += person_counts

In [None]:
# Share of people mentions per processed chapter for the first few people.
plt.figure(figsize=(17,8))
plt.xkcd()
# Plot at most four people, but never index past the end of the list when
# fewer than four were detected.
for i in range(min(4,len(people_probs))):
    # chapters without any people mentions have a zero total and plot as gaps (nan)
    plt.plot(people_probs[i][1]/people_totals,"-s",label=people[i])
plt.legend()

In [None]:
# For every location: (name, array of mention counts, one entry per processed chapter).
location_probs = [
    (place, np.array([float(counts.get(place, 0)) for counts in chapter_dicts]))
    for place in locations
]

In [None]:
# Total number of location mentions per processed chapter.
loc_totals = sum(
    (place_counts for _place, place_counts in location_probs),
    np.zeros(len(chapter_dicts)),
)

In [None]:
# Share of location mentions per processed chapter, one line per location.
plt.figure(figsize=(17,8))
plt.xkcd()
for place_label, (_place, place_counts) in zip(locations, location_probs):
    plt.plot(place_counts/loc_totals,"-s",label=place_label)
plt.legend()

In [None]:
# For each processed chapter pick the location with the most mentions;
# on a tie, the first of the tied locations is kept.
loc_array = []
place_names = np.array(locations)
for i in range(len(chapter_dicts)):
    chap_locs = np.array([place_counts[i] for _place, place_counts in location_probs])
    winner = chap_locs.max()
    loc = place_names[chap_locs == winner]
    print(loc)
    loc_array.append(loc[0])

In [None]:
# build lists of the ranges of each: 0, every index where the dominant
# location changes from the previous chapter, and the number of chapters
n_chapters = len(chapter_dicts)
breaks = (
    [0]
    + [k+1 for k in range(n_chapters-1) if loc_array[k] != loc_array[k+1]]
    + [n_chapters]
)
print(breaks)

In [None]:
# Dominant location of each contiguous run, taken at the run's first chapter.
cont_locs = [loc_array[run_start] for run_start in breaks[:-1]]
print(cont_locs)

In [None]:
# Draw a dashed vertical line at every break and label each run with its
# dominant location, centered between the two surrounding breaks.
plt.figure(figsize=(17,8))
for break_x in breaks:
    plt.plot([break_x,break_x],[0,1],"--",color="0.7")
for run_idx in range(len(breaks)-1):
    run_center = (breaks[run_idx]+breaks[run_idx+1])/2.
    print(run_center)
    plt.text(run_center,.94,cont_locs[run_idx],color="0.7",horizontalalignment="center",rotation=20)
plt.xlim([breaks[0],breaks[-1]])
print(len(chapter_dicts))

In [None]:
# Final figure: location runs as dashed, labelled regions, with the share of
# people mentions per processed chapter drawn on top.
plt.figure(figsize=(17,8))
for i in range(len(breaks)):
    plt.plot([breaks[i],breaks[i]],[0,1],"--",color="0.7")
for i in range(len(breaks)-1):
    plt.text((breaks[i]+breaks[i+1])/2.,.94,cont_locs[i],color="0.7",horizontalalignment="center",rotation=20)
plt.xlim([breaks[0],breaks[-1]])
# Plot at most four people, but never index past the end of the list when
# fewer than four were detected. The +0.5 puts each marker in the middle of
# its chapter's [i, i+1] slot.
for i in range(min(4,len(people_probs))):
    plt.plot(np.arange(len(chapter_dicts))+0.5,people_probs[i][1]/people_totals,"-s",label=people[i])
plt.legend(loc="center right")
mysavefig("getting-closer.png",folder="media/figures")
mysavefig("getting-closer.pdf",folder="media/figures")