In [151]:
import os
import re
import nltk
import textract
import pandas as pd
import numpy as np
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import PunktSentenceTokenizer




In [123]:
epub_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_epub_working/"
txt_path = "/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/_txt/"
path = '/Users/katbishop/Desktop/DSI-SF2-bishopkd/projects/capstone/data/'
folders = ['sci-fi_top','sci-fi_flop','romance_top','romance_flop']

In [152]:
# function to extract text from epub
def convert_epub_to_text(epub_path, epub_file, txt_path):
    clean_text = ''
    text_name = epub_file.replace(' ','_')[:-4]+'txt' #clean up filename and change file extention
    
    text = textract.process(epub_path+epub_file,encoding='utf_8') #extract text from epub
    clean_text = text.decode('ascii', 'ignore').replace('\n',' ') #trip out the unicode and return characters

    text_file = open(txt_path+text_name, 'w') #save as text file
    text_file.write(clean_text)
    text_file.close()   

In [153]:
# loop through files in directory, convert file, save file in new folder
for epub in os.listdir(epub_path):
    try:
        convert_epub_to_text(epub_path, epub, txt_path)
    except:
        print epub, "failed"

In [159]:
# load txt files into dataframe, 
# give each entry a best_selling 1/0 entry and a sci_fi 1/0 (0=romance) indicator
df = pd.DataFrame()

for folder in folders:
    if folder[-3:]=='top':
        bs = 1
    else:
        bs = 0
    if folder[:3]=='sci':
        sf = 1
    else:
        sf = 0
        
    for text_file in os.listdir(path+folder+'/'):
        full_path = path + folder + '/' + text_file
        if text_file.endswith((".txt")):
            text  = open(full_path, 'r').read()
            temp = pd.DataFrame({
                    'best_seller': bs,
                    'sci_fi': sf,
                    'title': text_file[:-4].replace('_',' ').replace('-',' - '),
                    'body': text.decode('ascii', 'ignore').replace('\n',' ').replace('\r','')}, 
                                index=[0])
            df = pd.concat([df, temp])
            
df = df.reset_index() # because index=[0]
del df['index']

In [161]:
# check for front and back matter in body
# i have cleaned up the files by using this alert
# remaining issues are intentional usage in the body

check_words = ['acknowledgements','table of contents','about the author', 'appendix', 
               'copyright','isbn','by this author', 'chapter']

for i in range(0,len(df)):
    for word in check_words:
        if word in df.iloc[i,1].lower():
            print df.ix[i,3], ' : ', word        

a fire upon the deep - vernor vinge  :  appendix
dune - frank herbert  :  appendix
rainbows end - vernor vinge  :  table of contents
rainbows end - vernor vinge  :  appendix
rainbows end - vernor vinge  :  copyright
the algebraist - iain m banks  :  acknowledgements
the algebraist - iain m banks  :  appendix
atlanta nights - travis tea  :  appendix
hell ship - philip palmer  :  about the author
fifty shades of grey - e l james  :  appendix
grey - e l james  :  appendix
outlander voyager - diana galbaldon  :  appendix


In [None]:
# load profanity file
curses = pd.read_csv(path + 'other/profanity.csv')
curses.drop('Unnamed: 1', inplace=True, axis=1)
bad_words = curses.word.T.tolist()

In [189]:
# function bank for creating metrics

def avg_sentence_len(text):
    word_counts = []
    tokenizer = RegexpTokenizer(r'\w+')
    sent_detect = PunktSentenceTokenizer()
    sentences = sent_detect.sentences_from_text(text)
    for sentence in sentences:
        words = tokenizer.tokenize(sentence)
        word_counts.append(len(words))
    avg_word_count = sum(word_counts)/len(word_counts)  
    return avg_word_count

#--------------------------------

def get_token_words(text):
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    return words

def word_count(text):
    words = get_token_words(text)
    return len(words)

def avg_word_len(text):
    letter_counts = []
    words = get_token_words(text)
    for word in words:
        letter_counts.append(len(word))
    avg_word_len = sum(letter_counts)/len(letter_counts)
    return avg_word_len

def profanity_counter(text):
    i=0
    words = get_token_words(text)
    for word in words:
        if word in bad_words:
            i+=1       
    return i

def lex_div(text):
    words = get_token_words(text)
    lexical_diversity = 1.0 * len(set(words)) / len(words)
    return lexical_diversity

#--------------------------------

def to_blob(text):
    blob = TextBlob(text)
    return blob

def assign_polarity(text):
    blob = to_blob(text)
    return blob.sentiment.polarity

def assign_subjectivity(text):
    blob = to_blob(text)
    return blob.sentiment.subjectivity

#---------------------------------

def parse_pos(df,field):
    for i in range(0,len(df)):
        blob = TextBlob(df.ix[i,field])
        tags = blob.tags
        df_tags = pd.DataFrame(tags)
        df_tags = df_tags.groupby([1]).count().reset_index()
        for x in range(0,len(df_tags)):
                df.ix[i, df_tags.ix[x,1] ] = df_tags.ix[x,0]
        df.fillna(0,inplace=True)

In [162]:
# create new columns of metrics
# note: this is take a very long time to run

df['avg_sent_len'] = df['body'].map(avg_sentence_len)
df['word_count'] = df['body'].map(word_count)
df['avg_word_len'] = df['body'].map(avg_word_len)
df['lex_diversity'] = df['body'].map(lex_div)
df['polarity'] = df['body'].map(assign_polarity)
df['subjectivity'] = df['body'].map(assign_subjectivity)
df['profanity'] = df['body'].map(profanity_counter)
df['profane'] = 1. * df['profanity']/df['word_count']
parse_pos(df,'body')

In [None]:
df.rename(columns={'CC':'conj_coord', 'CD':'number', 'DT':'determiner', 'EX':'exist_there',
                  'FW':'foreign_word','IN':'conj_sub_prep','JJ':'adj','JJR':'adj_compare',
                 'JJS':'adj_sup','MD':'verb_aux',  'NN':'noun','NNP':'noun_prop',
                'NNPS':'noun_prop_pural',  'NNS':'noun_plural', 'PDT':'predeterm','PRP':'pronoun_pers',
                'PRP$':'pronoun_poss',  'RB':'adv','RBR':'adv_compare','RBS':'adv_sup',
                  'RP':'adv_part', 'TO':'inf_to',  'UH':'interject','VB':'verb_base',
                 'VBD':'verb_past','VBG':'verb_ger','VBN':'verb_pp','VBP':'verb_sing_pres',
                 'VBZ':'verb_3rd_sing_pres','WDT':'wh_determ','WP':'wh_pronoun','WP$':'wh_poss',
                 'WRB':'wh_adv','POS':'poss_ending','SYM':'symbol','LS':'list_marker'}, inplace=True)

In [177]:
df.head(2)

Unnamed: 0,best_seller,body,sci_fi,title,avg_sent_len,word_count,avg_word_len,lex_diversity
0,1,How to explain? How to describe? Even the omni...,1,a fire upon the deep - vernor vinge,11,207548,4,0.064477
1,1,"The body lay naked and facedown, a deathly gr...",1,ancillary justice - ann leckie,11,108263,4,0.079815


In [None]:
df.to_csv(path + 'df.csv') # save to file so we don't have to do that again

In [None]:
bs = df[(df['best_seller']==1)]
f = df[(df['best_seller']==0)]
sfbs = df[((df['best_seller']==1) & (df['sci_fi']==1))]
rmbs = df[((df['best_seller']==1) & (df['sci_fi']==0))]
sff = df[((df['best_seller']==0) & (df['sci_fi']==1))]
rmf = df[((df['best_seller']==0) & (df['sci_fi']==0))]

In [1]:
df.describe().T

In [2]:
sfbs.describe().T

In [3]:
sff.describe().T

In [4]:
rmbs.describe().T

In [5]:
rmf.describe().T

In [None]:
stop_words = text.ENGLISH_STOP_WORDS.union(['232'])
char_name_stop = stop_words.union(['awn','muad','dib','hock','seng','nell','pham','nuwen', 'enzo','anaander','mianaai',
                                   'rautha','feyd','willoughby','brandon','john','dashwood','jessica','fang','hiro',
                                  'steele','anastasia','rochester','jennings','ian','middleton','tara','jean','fairfax',
                                  'mcgraw','finkle', 'dearborne','dearbornes','peterby','anne','alice','henry','simon',
                                  'gavin','marco','bruce','catherine','nicky','brent','reverend','bene','gesserit','clef',
                                  'radch','kei',"sjandra",'goodbody','paul','robert','horza','ender','ruby','travis','dar',
                                  'miller','holden','travis','leto','mike','anna','justus','swan','jeff','onor','stilgar',
                                  'sam','jamie','sasha','riley','nerezza','christian','grey'])

In [178]:
# initial feature review to see anything interesting

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from sklearn.feature_extraction import text 

vect = TfidfVectorizer(ngram_range=(1,1), stop_words=char_name_stop)

summaries_sfbs = "".join(sfbs['body'])
summaries_rmbs = "".join(rmbs['body'])
summaries_sff = "".join(sff['body'])
summaries_rmf = "".join(rmf['body'])
summaries_bs = "".join(bs['body'])
summaries_f = "".join(f['body'])

ngrams_summaries_sfbs = vect.build_analyzer()(summaries_sfbs)
ngrams_summaries_rmbs = vect.build_analyzer()(summaries_rmbs)
ngrams_summaries_sff = vect.build_analyzer()(summaries_sff)
ngrams_summaries_rmf = vect.build_analyzer()(summaries_rmf)
ngrams_summaries_bs = vect.build_analyzer()(summaries_bs)
ngrams_summaries_f = vect.build_analyzer()(summaries_f)

In [174]:
# sci-fi books over-use character names
# also a long time ago, far far away
Counter(ngrams_summaries_sfbs).most_common(20)

[(u'hock seng', 579),
 (u'shook head', 470),
 (u'lieutenant awn', 397),
 (u'old man', 302),
 (u'long time', 266),
 (u'bene gesserit', 265),
 (u'princess nell', 242),
 (u'white shirts', 241),
 (u'years ago', 232),
 (u'judge fang', 210),
 (u'anaander mianaai', 201),
 (u'hiro says', 187),
 (u'muad dib', 181),
 (u'feyd rautha', 179),
 (u'thousand years', 171),
 (u'far away', 160),
 (u'reverend mother', 147),
 (u'pham nuwen', 147),
 (u'hong kong', 140),
 (u'uncle enzo', 132)]

In [180]:
Counter(ngrams_summaries_sff).most_common(20)

[(u'sai ias', 295),
 (u'shook head', 163),
 (u'dr clef', 162),
 (u'hell ship', 140),
 (u'father marco', 104),
 (u'explorer 410', 96),
 (u'im sure', 93),
 (u'im sorry', 73),
 (u'big guy', 58),
 (u'bruce lucent', 57),
 (u'long time', 55),
 (u'dr simon', 53),
 (u'alice gavin', 50),
 (u'clockwork plague', 48),
 (u'years ago', 47),
 (u'death ship', 47),
 (u'henry archer', 45),
 (u'gavin alice', 45),
 (u'make sure', 44),
 (u'far away', 43)]

In [181]:
# romance titles have some classics --> deep breath, closed eyes, emotional gestures
Counter(ngrams_summaries_rmbs).most_common(20)

[(u'shook head', 518),
 (u'im going', 361),
 (u'deep breath', 255),
 (u'im sorry', 245),
 (u'im sure', 201),
 (u'closed eyes', 191),
 (u'long time', 179),
 (u'blue eyes', 162),
 (u'youre going', 160),
 (u'took deep', 157),
 (u'took deep breath', 140),
 (u'shaking head', 130),
 (u'living room', 129),
 (u'open door', 119),
 (u'oh aye', 112),
 (u'eyes closed', 103),
 (u'turned away', 98),
 (u'yes sir', 96),
 (u'oh yes', 96),
 (u'ive seen', 95)]

In [182]:
Counter(ngrams_summaries_rmf).most_common(20)

[(u'lord dearborne', 182),
 (u'mrs goodbody', 151),
 (u'shook head', 139),
 (u'new york', 83),
 (u'im sorry', 80),
 (u'maverick junction', 67),
 (u'lady catherine', 63),
 (u'im sure', 59),
 (u'parking lot', 50),
 (u'lord peterby', 49),
 (u'lord dearbornes', 48),
 (u'dr brent', 46),
 (u'closed eyes', 44),
 (u'lady peterby', 41),
 (u'oh god', 40),
 (u'night stand', 39),
 (u'lady anne', 38),
 (u'rolled eyes', 38),
 (u'wasnt sure', 36),
 (u'im gonna', 36)]

In [183]:

char_name_stop = stop_words.union(['awn','muad','dib','hock','seng','nell','pham','nuwen', 'enzo','anaander','mianaai',
                                   'rautha','feyd','willoughby','brandon','john','dashwood','jessica','fang','hiro',
                                  'steele','anastasia','rochester','jennings','ian','middleton','tara','jean','fairfax',
                                  'mcgraw','finkle', 'dearborne','dearbornes','peterby','anne','alice','henry','simon',
                                  'gavin','marco','bruce','catherine','nicky','brent'])

vect = TfidfVectorizer(ngram_range=(2,3), stop_words=char_name_stop)

summaries_sfbs = "".join(sfbs['body'])
summaries_rmbs = "".join(rmbs['body'])
summaries_sff = "".join(sff['body'])
summaries_rmf = "".join(rmf['body'])

ngrams_summaries_sfbs = vect.build_analyzer()(summaries_sfbs)
ngrams_summaries_rmbs = vect.build_analyzer()(summaries_rmbs)
ngrams_summaries_sff = vect.build_analyzer()(summaries_sff)
ngrams_summaries_rmf = vect.build_analyzer()(summaries_rmf)

In [184]:
Counter(ngrams_summaries_sfbs).most_common(20)

[(u'shook head', 470),
 (u'old man', 302),
 (u'long time', 266),
 (u'bene gesserit', 265),
 (u'white shirts', 241),
 (u'years ago', 232),
 (u'thousand years', 171),
 (u'far away', 160),
 (u'reverend mother', 147),
 (u'hong kong', 140),
 (u'shakes head', 124),
 (u'make sure', 124),
 (u'sjandra kei', 121),
 (u'lord radch', 120),
 (u'closed eyes', 119),
 (u'long ago', 119),
 (u'right hand', 116),
 (u'half hour', 115),
 (u'years old', 114),
 (u'im sure', 106)]

In [185]:
Counter(ngrams_summaries_sff).most_common(20)

[(u'sai ias', 295),
 (u'shook head', 163),
 (u'dr clef', 162),
 (u'hell ship', 140),
 (u'explorer 410', 96),
 (u'im sure', 93),
 (u'im sorry', 73),
 (u'big guy', 58),
 (u'long time', 55),
 (u'clockwork plague', 48),
 (u'years ago', 47),
 (u'death ship', 47),
 (u'make sure', 44),
 (u'far away', 43),
 (u'impossible cube', 43),
 (u'dar frowned', 41),
 (u'im going', 41),
 (u'ive got', 40),
 (u'dar nodded', 40),
 (u'young man', 40)]

In [186]:
# this is funnier
Counter(ngrams_summaries_rmbs).most_common(30)

[(u'shook head', 518),
 (u'im going', 361),
 (u'deep breath', 255),
 (u'im sorry', 245),
 (u'im sure', 201),
 (u'closed eyes', 191),
 (u'long time', 179),
 (u'blue eyes', 162),
 (u'youre going', 160),
 (u'took deep', 157),
 (u'took deep breath', 140),
 (u'shaking head', 130),
 (u'living room', 129),
 (u'open door', 119),
 (u'oh aye', 112),
 (u'eyes closed', 103),
 (u'turned away', 98),
 (u'yes sir', 96),
 (u'oh yes', 96),
 (u'ive seen', 95),
 (u'young man', 95),
 (u'im glad', 95),
 (u'ye ken', 94),
 (u'new york', 92),
 (u'little bit', 91),
 (u'gray eyes', 88),
 (u'eyes fixed', 87),
 (u'tell ye', 85),
 (u'cleared throat', 84),
 (u'ive got', 81)]

In [187]:
Counter(ngrams_summaries_rmf).most_common(30)

[(u'mrs goodbody', 151),
 (u'shook head', 139),
 (u'new york', 83),
 (u'im sorry', 80),
 (u'maverick junction', 67),
 (u'lady catherine', 63),
 (u'im sure', 59),
 (u'parking lot', 50),
 (u'dr brent', 46),
 (u'closed eyes', 44),
 (u'oh god', 40),
 (u'night stand', 39),
 (u'rolled eyes', 38),
 (u'wasnt sure', 36),
 (u'im gonna', 36),
 (u'opened door', 35),
 (u'im going', 35),
 (u'mr little', 35),
 (u'youre right', 33),
 (u'sacre bleu', 31),
 (u'living room', 30),
 (u'ive got', 30),
 (u'uncle nicky', 29),
 (u'held hand', 29),
 (u'lone tree', 29),
 (u'ice cream', 28),
 (u'oh yeah', 27),
 (u'make sure', 27),
 (u'youre going', 26),
 (u'high school', 25)]