In [80]:
import textract
import os
import pandas as pd
import numpy as np
import re
import nltk
from textblob import TextBlob, Word

from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import wordnet
from collections import Counter, defaultdict
from gensim import corpora, models, matutils
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
epub_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_epub_working/"
txt_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_txt/"
path = '/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/'
folders = ['sci-fi_top','sci-fi_flop','romance_top','romance_flop']

In [3]:
# load profanity file
curses = pd.read_csv(path + 'other/profanity.csv')
curses.drop('Unnamed: 1', inplace=True, axis=1)
bad_words = curses.word.T.tolist()

In [None]:
# load character name file


In [4]:
# function to extract text from epub

def convert_epub_to_text(epub_path, epub_file, txt_path):

    clean_text = ''
    text_name = epub_file.replace(' ','_')[:-4]+'txt'
    
    # extract text from epub
    text = textract.process(epub_path+epub_file,encoding='utf_8')
    
    # trip out the unicode and return characters
    # still working on the \ characters
    #for i in text.split(' '):
    clean_text = text.decode('ascii', 'ignore').replace('\n',' ')
    
    # save as text file
    text_file = open(txt_path+text_name, 'w')
    text_file.write(clean_text)
    text_file.close()
    

In [6]:
# loop through epub files in directory, extract text, save text file in new folder
for epub in os.listdir(epub_path):
    try:
        convert_epub_to_text(epub_path, epub, txt_path)
    except:
        print epub, " failed"

colour_had_all_been_packed_away-libby_oneill.epub  failed
good_breeding-jl_merrow.epub  failed
iastron-james_dunn.epub  failed
star_diary-andreas_ingo.epub  failed
tales_from_inter _space_freight_services-duane_smith.epub  failed
the_edge_of_eternity-mark_holzclaw.epub  failed


In [5]:
# load txt files into dataframe, 
# give each entry a best_selling 1/0 indicator and a sci_fi 1/0 (0=romance) indicator

df = pd.DataFrame()

for folder in folders:
    if folder[-3:]=='top':
        bs = 1
    else:
        bs = 0
    if folder[:3]=='sci':
        sf = 1
    else:
        sf = 0
        
    for text_file in os.listdir(path+folder+'/'):
        full_path = path + folder + '/' + text_file
        if text_file.endswith((".txt")):
            text  = open(full_path, 'r').read()
            temp = pd.DataFrame({
                    'best_seller': bs,
                    'sci_fi': sf,
                    'title': text_file[:-4].replace('_',' ').replace('-',' - '),
                    'body': text.decode('ascii', 'ignore').replace('\n',' ').replace('\r','')}, 
                                index=[0])
            df = pd.concat([df, temp])

In [6]:
# house cleaning
df = df.reset_index()
del df['index']
df.head(2)

Unnamed: 0,best_seller,body,sci_fi,title
0,1,Prologue The sun is always just about to ris...,1,2312 - kim stanley robinson
1,1,How to explain? How to describe? Even the omni...,1,a fire upon the deep - vernor vinge


In [7]:
# check for front and back matter in body
# i have cleaned up the files by using this alert
# remaining issues are intentional usage of terms in the text

check_words = ['acknowledgements','table of contents','about the author', 'appendix', 
               'copyright','isbn','by this author']

for i in range(0,len(df)):
    for word in check_words:
        if word in df.iloc[i,1].lower():
            print df.ix[i,3], ' : ', word
        

a fire upon the deep - vernor vinge  :  appendix
dune - frank herbert  :  appendix
rainbows end - vernor vinge  :  table of contents
rainbows end - vernor vinge  :  appendix
rainbows end - vernor vinge  :  copyright
the algebraist - iain m banks  :  acknowledgements
the algebraist - iain m banks  :  appendix
atlanta nights - travis tea  :  appendix
red planet pioneer - vincent tibbetts  :  copyright
fifty shades of grey - e l james  :  appendix
grey - e l james  :  appendix
outlander voyager - diana galbaldon  :  appendix
the rosie project - graeme simsion  :  table of contents
harder - brenda cooper  :  copyright


In [8]:
# function bank

def avg_sentence_len(text):
    word_counts = []
    tokenizer = RegexpTokenizer(r'\w+')
    sent_detect = PunktSentenceTokenizer()
    sentences = sent_detect.sentences_from_text(text)
    for sentence in sentences:
        words = tokenizer.tokenize(sentence)
        word_counts.append(len(words))
    avg_word_count = sum(word_counts)/len(word_counts)  
    return avg_word_count

#--------------------------------

def get_token_words(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    return words

def word_count(text):
    words = get_token_words(text)
    return len(words)

def avg_word_len(text):
    letter_counts = []
    words = get_token_words(text)
    for word in words:
        letter_counts.append(len(word))
    avg_word_len = sum(letter_counts)/len(letter_counts)
    return avg_word_len

def profanity_counter(text):
    i=0
    words = get_token_words(text)
    for word in words:
        if word in bad_words:
            i+=1       
    return i

def lex_div(text):
    words = get_token_words(text)
    lexical_diversity = 1.0 * len(set(words)) / len(words)
    return lexical_diversity

#--------------------------------

def to_blob(text):
    blob = TextBlob(text)
    return blob

def assign_polarity(text):
    blob = to_blob(text)
    return blob.sentiment.polarity

def assign_subjectivity(text):
    blob = to_blob(text)
    return blob.sentiment.subjectivity

#---------------------------------

def parse_pos(df,field):
    for i in range(0,len(df)):
        blob = TextBlob(df.ix[i,field])
        tags = blob.tags
        df_tags = pd.DataFrame(tags)
        df_tags = df_tags.groupby([1]).count().reset_index()
        for x in range(0,len(df_tags)):
                df.ix[i, df_tags.ix[x,1] ] = df_tags.ix[x,0]
        df.fillna(0,inplace=True)

In [9]:
# blobs = df['body'].map(to_blob)
# words = df['body'].map(get_token_words)

# create metrics

df['avg_sent_len'] = df['body'].map(avg_sentence_len)
df['word_count'] = df['body'].map(word_count)
df['avg_word_len'] = df['body'].map(avg_word_len)
df['lex_diversity'] = df['body'].map(lex_div)
df['polarity'] = df['body'].map(assign_polarity)
df['subjectivity'] = df['body'].map(assign_subjectivity)
df['profanity'] = df['body'].map(profanity_counter)
df['profane'] = 1. * df['profanity']/df['word_count']


In [58]:
parse_pos(df,'body')

In [12]:
# for i in range(0,len(df)):
#     blob = TextBlob(df.ix[i,'body'])
#     tags = blob.tags
#     df_tags = pd.DataFrame(tags)
#     df_tags = df_tags.groupby([1]).count().reset_index()
#     for x in range(0,len(df_tags)):
#             df.ix[i, df_tags.ix[x,1] ] = df_tags.ix[x,0]
#     df.fillna(0,inplace=True)



In [89]:
#df.head(3) 

In [88]:
#df.describe().T

In [16]:
df.columns

Index([u'best_seller', u'body', u'sci_fi', u'title', u'avg_sent_len',
       u'word_count', u'avg_word_len', u'lex_diversity', u'polarity',
       u'subjectivity', u'profanity', u'profane', u'conj_coord', u'number',
       u'determiner', u'exist_there', u'foreign_word', u'conj_sub_prep',
       u'adj', u'adj_compare', u'adj_sup', u'verb_aux', u'noun', u'noun_prop',
       u'noun_prop_pural', u'noun_plural', u'predeterm', u'pronoun_pers',
       u'pronoun_poss', u'adv', u'adv_compare', u'adv_sup', u'adv_part',
       u'inf_to', u'interject', u'verb_base', u'verb_past', u'verb_ger',
       u'verb_pp', u'verb_sing_pres', u'verb_3rd_sing_pres', u'wh_determ',
       u'wh_pronoun', u'wh_poss', u'wh_adv', u'poss_ending', u'symbol',
       u'list_marker'],
      dtype='object')

In [15]:
df.rename(columns={'CC':'conj_coord','CD':'number', 'DT':'determiner', 'EX':'exist_there',
                  'FW':'foreign_word',  'IN':'conj_sub_prep','JJ':'adj','JJR':'adj_compare',
                 'JJS':'adj_sup','MD':'verb_aux',  'NN':'noun','NNP':'noun_prop',
                'NNPS':'noun_prop_pural',  'NNS':'noun_plural', 'PDT':'predeterm','PRP':'pronoun_pers',
                'PRP$':'pronoun_poss',  'RB':'adv','RBR':'adv_compare','RBS':'adv_sup',
                  'RP':'adv_part', 'TO':'inf_to',  'UH':'interject','VB':'verb_base',
                 'VBD':'verb_past','VBG':'verb_ger','VBN':'verb_pp','VBP':'verb_sing_pres',
                 'VBZ':'verb_3rd_sing_pres','WDT':'wh_determ','WP':'wh_pronoun','WP$':'wh_poss',
                 'WRB':'wh_adv','POS':'poss_ending','SYM':'symbol','LS':'list_marker'}, inplace=True)

In [17]:
df.to_csv(path + 'df.csv')

In [55]:
bs = df[(df['best_seller']==1)]
f = df[(df['best_seller']==0)]
sfbs = df[((df['best_seller']==1) & (df['sci_fi']==1))]
rmbs = df[((df['best_seller']==1) & (df['sci_fi']==0))]
sff = df[((df['best_seller']==0) & (df['sci_fi']==1))]
rmf = df[((df['best_seller']==0) & (df['sci_fi']==0))]

In [84]:
#sfbs.describe().T

In [85]:
#sff.describe().T

In [86]:
#rmbs.describe().T

In [87]:
#rmf.describe().T

In [None]:
stop_words = text.ENGLISH_STOP_WORDS.union(['232'])
char_name_stop = stop_words.union(['awn','muad','dib','hock','seng','nell','pham','nuwen', 'enzo','anaander','mianaai',
                                   'rautha','feyd','willoughby','brandon','john','dashwood','jessica','fang','hiro',
                                  'steele','anastasia','rochester','jennings','ian','middleton','tara','jean','fairfax',
                                  'mcgraw','finkle', 'dearborne','dearbornes','peterby','anne','alice','henry','simon',
                                  'gavin','marco','bruce','catherine','nicky','brent','reverend','bene','gesserit','clef',
                                  'radch','kei',"sjandra",'goodbody','paul','robert','horza','ender','ruby','travis','dar',
                                  'miller','holden','travis','leto','mike','anna','justus','swan','jeff','onor','stilgar',
                                  'sam','jamie','sasha','riley','nerezza','christian','grey'])

In [101]:
# initial feature review to see anything interesting

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.feature_extraction import text 

vect = TfidfVectorizer(ngram_range=(1,1), stop_words=char_name_stop)

summaries_sfbs = "".join(sfbs['body'])
summaries_rmbs = "".join(rmbs['body'])
summaries_sff = "".join(sff['body'])
summaries_rmf = "".join(rmf['body'])
summaries_bs = "".join(bs['body'])
summaries_f = "".join(f['body'])

ngrams_summaries_sfbs = vect.build_analyzer()(summaries_sfbs)
ngrams_summaries_rmbs = vect.build_analyzer()(summaries_rmbs)
ngrams_summaries_sff = vect.build_analyzer()(summaries_sff)
ngrams_summaries_rmf = vect.build_analyzer()(summaries_rmf)
ngrams_summaries_bs = vect.build_analyzer()(summaries_bs)
ngrams_summaries_f = vect.build_analyzer()(summaries_f)

In [102]:
Counter(ngrams_summaries_bs).most_common(20)

[(u'said', 27773),
 (u'like', 14232),
 (u'just', 11302),
 (u'time', 9944),
 (u'know', 9796),
 (u'way', 7608),
 (u'did', 7151),
 (u'im', 6831),
 (u'eyes', 6654),
 (u'think', 6555),
 (u'hand', 6430),
 (u'head', 6340),
 (u'dont', 6281),
 (u'thought', 6238),
 (u'going', 6090),
 (u'looked', 6015),
 (u'right', 5908),
 (u'didnt', 5700),
 (u'away', 5677),
 (u'little', 5672)]

In [103]:
Counter(ngrams_summaries_f).most_common(20)

[(u'said', 10446),
 (u'like', 9050),
 (u'just', 7450),
 (u'know', 6576),
 (u'im', 6150),
 (u'dont', 5846),
 (u'time', 5504),
 (u'way', 4519),
 (u'didnt', 4504),
 (u'did', 4246),
 (u'eyes', 4099),
 (u'think', 3987),
 (u'head', 3967),
 (u'right', 3858),
 (u'going', 3822),
 (u'looked', 3769),
 (u'want', 3730),
 (u'hand', 3640),
 (u'away', 3450),
 (u'look', 3264)]

In [95]:
# sci-fi books over-use character names
# also a long time ago, far far away
Counter(ngrams_summaries_sfbs).most_common(20)

[(u'said', 16618),
 (u'like', 8113),
 (u'just', 6633),
 (u'time', 5624),
 (u'know', 4966),
 (u'way', 4626),
 (u'people', 4198),
 (u'did', 3921),
 (u'looked', 3706),
 (u'little', 3406),
 (u'thought', 3386),
 (u'man', 3173),
 (u'think', 3159),
 (u'long', 3156),
 (u'away', 3147),
 (u'right', 3125),
 (u'going', 2930),
 (u'old', 2783),
 (u'got', 2766),
 (u'head', 2757)]

In [96]:
Counter(ngrams_summaries_sff).most_common(20)

[(u'said', 7156),
 (u'like', 4863),
 (u'just', 4056),
 (u'know', 3395),
 (u'time', 3181),
 (u'way', 2772),
 (u'dont', 2669),
 (u'did', 2428),
 (u'looked', 2367),
 (u'people', 2236),
 (u'im', 2205),
 (u'didnt', 2116),
 (u'think', 2110),
 (u'going', 2012),
 (u'right', 1962),
 (u'head', 1927),
 (u'eyes', 1868),
 (u'away', 1866),
 (u'little', 1825),
 (u'hand', 1758)]

In [97]:
# romance titles have some classics --> deep breath, closed eyes, emotional gestures
Counter(ngrams_summaries_rmbs).most_common(20)

[(u'said', 11155),
 (u'like', 6119),
 (u'im', 5455),
 (u'know', 4830),
 (u'just', 4669),
 (u'dont', 4507),
 (u'time', 4320),
 (u'eyes', 4170),
 (u'hand', 3890),
 (u'didnt', 3754),
 (u'head', 3583),
 (u'want', 3551),
 (u'think', 3396),
 (u'did', 3230),
 (u'going', 3160),
 (u'face', 3118),
 (u'good', 3006),
 (u'way', 2982),
 (u'youre', 2914),
 (u'thought', 2852)]

In [98]:
Counter(ngrams_summaries_rmf).most_common(20)

[(u'like', 4187),
 (u'im', 3945),
 (u'just', 3394),
 (u'said', 3290),
 (u'know', 3181),
 (u'dont', 3177),
 (u'didnt', 2388),
 (u'want', 2349),
 (u'time', 2323),
 (u'eyes', 2231),
 (u'head', 2040),
 (u'right', 1896),
 (u'hand', 1882),
 (u'think', 1877),
 (u'did', 1818),
 (u'going', 1810),
 (u'way', 1747),
 (u'good', 1653),
 (u'face', 1622),
 (u'shed', 1589)]

In [36]:
vect = TfidfVectorizer(ngram_range=(2,3), stop_words=char_name_stop)

summaries_sfbs = "".join(sfbs['body'])
summaries_rmbs = "".join(rmbs['body'])
summaries_sff = "".join(sff['body'])
summaries_rmf = "".join(rmf['body'])

ngrams_summaries_sfbs = vect.build_analyzer()(summaries_sfbs)
ngrams_summaries_rmbs = vect.build_analyzer()(summaries_rmbs)
ngrams_summaries_sff = vect.build_analyzer()(summaries_sff)
ngrams_summaries_rmf = vect.build_analyzer()(summaries_rmf)

In [40]:
Counter(ngrams_summaries_sfbs).most_common(20)

[(u'shook head', 456),
 (u'long time', 314),
 (u'old man', 311),
 (u'years ago', 276),
 (u'white shirts', 241),
 (u'thousand years', 206),
 (u'far away', 184),
 (u'cloud ark', 183),
 (u'hong kong', 140),
 (u'im sure', 136),
 (u'half hour', 132),
 (u'im going', 128),
 (u'make sure', 126),
 (u'long ago', 126),
 (u'years old', 124),
 (u'shakes head', 124),
 (u'right hand', 122),
 (u'closed eyes', 122),
 (u'time time', 119),
 (u'half dozen', 116)]

In [42]:
Counter(ngrams_summaries_sff).most_common(20)

[(u'shook head', 317),
 (u'sai ias', 295),
 (u'im sorry', 192),
 (u'im sure', 184),
 (u'im going', 156),
 (u'long time', 145),
 (u'make sure', 137),
 (u'deep breath', 115),
 (u'hell ship', 114),
 (u'old man', 106),
 (u'years ago', 105),
 (u'took deep', 102),
 (u'purple man', 100),
 (u'youre going', 98),
 (u'closed eyes', 97),
 (u'explorer 410', 96),
 (u'took deep breath', 93),
 (u'far away', 92),
 (u'diamond deep', 83),
 (u'purple men', 80)]

In [37]:
# this is funnier
Counter(ngrams_summaries_rmbs).most_common(30)

[(u'shook head', 694),
 (u'im going', 601),
 (u'im sorry', 355),
 (u'im sure', 296),
 (u'deep breath', 276),
 (u'long time', 240),
 (u'youre going', 240),
 (u'closed eyes', 218),
 (u'blue eyes', 185),
 (u'new york', 177),
 (u'took deep', 174),
 (u'wasnt sure', 162),
 (u'living room', 161),
 (u'took deep breath', 155),
 (u'shaking head', 145),
 (u'ive got', 143),
 (u'open door', 132),
 (u'im glad', 125),
 (u'oh god', 123),
 (u'little bit', 123),
 (u'ive seen', 120),
 (u'years ago', 115),
 (u'young man', 114),
 (u'make sure', 114),
 (u'wasnt going', 113),
 (u'eyes closed', 113),
 (u'oh aye', 112),
 (u'youve got', 110),
 (u'oh yes', 108),
 (u'turned away', 107)]

In [38]:
Counter(ngrams_summaries_rmf).most_common(30)

[(u'shook head', 325),
 (u'im sorry', 303),
 (u'im going', 288),
 (u'im sure', 250),
 (u'mrs goodbody', 151),
 (u'living room', 147),
 (u'ive got', 141),
 (u'closed eyes', 138),
 (u'new york', 137),
 (u'parking lot', 133),
 (u'high school', 125),
 (u'im just', 117),
 (u'make sure', 112),
 (u'wasnt sure', 110),
 (u'long time', 107),
 (u'opened door', 102),
 (u'years ago', 100),
 (u'oh god', 98),
 (u'youre going', 97),
 (u'deep breath', 92),
 (u'shakes head', 92),
 (u'youve got', 88),
 (u'think im', 86),
 (u'im glad', 85),
 (u'rolled eyes', 81),
 (u'youre right', 77),
 (u'held hand', 75),
 (u'just want', 73),
 (u'pulled away', 73),
 (u'ive seen', 71)]

# Models

In [18]:
vect = CountVectorizer(stop_words=char_name_stop)
X = vect.fit_transform(df['body'])

In [73]:
vect_sf1 = CountVectorizer(stop_words=char_name_stop)
X_sf1 = vect_sf1.fit_transform(sfbs['body'])
vocab_sf1 = {v: k for k, v in vect_sf1.vocabulary_.iteritems()}

lda_sf1 = models.LdaModel(
    matutils.Sparse2Corpus(X_sf1, documents_columns=False),
    num_topics  =  5,
    passes      =  5,
    id2word     =  vocab_sf1
)
lda_sf1.print_topics(num_topics=5, num_words=3)

[(0, u'0.005*time + 0.004*people + 0.004*just'),
 (1, u'0.006*just + 0.004*looked + 0.004*time'),
 (2, u'0.006*just + 0.005*says + 0.004*time'),
 (3, u'0.004*thought + 0.004*man + 0.003*way'),
 (4, u'0.005*just + 0.005*time + 0.003*way')]

In [75]:
vect_sf0 = CountVectorizer(stop_words=char_name_stop)
X_sf0 = vect_sf0.fit_transform(sff['body'])
vocab_sf0 = {v: k for k, v in vect_sf0.vocabulary_.iteritems()}

lda_sf0 = models.LdaModel(
    matutils.Sparse2Corpus(X_sf0, documents_columns=False),
    num_topics  =  5,
    passes      =  5,
    id2word     =  vocab_sf0
)
lda_sf0.print_topics(num_topics=5, num_words=3)


[(0, u'0.005*time + 0.004*just + 0.004*way'),
 (1, u'0.006*people + 0.005*just + 0.005*time'),
 (2, u'0.007*just + 0.005*time + 0.004*way'),
 (3, u'0.007*just + 0.005*im + 0.005*time'),
 (4, u'0.004*just + 0.004*looked + 0.004*time')]

In [79]:
vect_rm1 = CountVectorizer(stop_words=char_name_stop)
X_rm1 = vect_rm1.fit_transform(rmbs['body'])
vocab_rm1 = {v: k for k, v in vect_rm1.vocabulary_.iteritems()}

lda_rm1 = models.LdaModel(
    matutils.Sparse2Corpus(X_rm1, documents_columns=False),
    num_topics  =  5,
    passes      =  5,
    id2word     =  vocab_rm1
)
lda_rm1.print_topics(num_topics=5, num_words=3)

[(0, u'0.006*time + 0.005*rosie + 0.005*im'),
 (1, u'0.008*im + 0.007*just + 0.006*time'),
 (2, u'0.009*ye + 0.004*hand + 0.004*head'),
 (3, u'0.006*hand + 0.006*eyes + 0.005*ye'),
 (4, u'0.010*im + 0.007*want + 0.006*eyes')]

In [90]:
vect_rm0 = CountVectorizer(stop_words=char_name_stop)
X_rm0 = vect_rm0.fit_transform(rmf['body'])
vocab_rm0 = {v: k for k, v in vect_rm1.vocabulary_.iteritems()}

lda_rm0 = models.LdaModel(
    matutils.Sparse2Corpus(X_rm0, documents_columns=False),
    num_topics  =  5,
    passes      =  5,
    id2word     =  vocab_rm0
)
lda_rm0.print_topics(num_topics=5, num_words=3)

[(0, u'0.008*beltane + 0.007*expressions + 0.005*delightfulbut'),
 (1, u'0.009*moths + 0.009*beenbut + 0.007*countermoves'),
 (2, u'0.010*electronic + 0.007*expressions + 0.006*particularly'),
 (3, u'0.009*electronic + 0.005*expressions + 0.005*leastout'),
 (4, u'0.007*expressions + 0.006*electronic + 0.005*nerezza')]